diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index b5e60781e0006..184fed2268e81 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -193,17 +193,24 @@ jobs: **/crash_diagnostics/* macos: - runs-on: macos-14 needs: [ stage1 ] strategy: - fail-fast: true + fail-fast: false matrix: - config: [ - generic-cxx03, - generic-cxx23, - generic-modules, - apple-configuration - ] + include: + - config: generic-cxx03 + os: macos-latest + - config: generic-cxx23 + os: macos-latest + - config: generic-modules + os: macos-latest + - config: apple-configuration + os: macos-latest + - config: apple-system + os: macos-13 + - config: apple-system-hardened + os: macos-13 + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: maxim-lobanov/setup-xcode@v1 diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index 394b0c74d24ed..f5318aecc53a7 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -43,6 +43,7 @@ on: - '.github/workflows/release-binaries.yml' - '.github/workflows/release-binaries-setup-stage/*' - '.github/workflows/release-binaries-save-stage/*' + - 'clang/cmake/caches/Release.cmake' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'dispatch' }} diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 925912df6843e..f24e25879b96b 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -420,6 +420,14 @@ jobs: attestations: write # For artifact attestations steps: + - name: Checkout Release Scripts + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + sparse-checkout: | + llvm/utils/release/github-upload-release.py + llvm/utils/git/requirements.txt + sparse-checkout-cone-mode: false + - name: 'Download artifact' uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1 with: @@ -442,14 +450,6 @@ jobs: name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl - - name: Checkout Release Scripts - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - sparse-checkout: | - llvm/utils/release/github-upload-release.py - llvm/utils/git/requirements.txt - sparse-checkout-cone-mode: false - - name: Install Python Requirements run: | pip install --require-hashes -r ./llvm/utils/git/requirements.txt diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 70e5f08b6f72e..922c5093f1357 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -72,17 +72,20 @@ jobs: ref: main fetch-depth: 0 path: www-releases + persist-credentials: false - name: Upload Release Notes if: env.upload env: - WWW_RELEASES_TOKEN: ${{ secrets.WWW_RELEASES_TOKEN }} + GH_TOKEN: ${{ secrets.WWW_RELEASES_TOKEN }} run: | - mkdir -p ../www-releases/${{ inputs.release-version }} - mv ./docs-build/html-export/* ../www-releases/${{ inputs.release-version }} - cd ../www-releases + mkdir -p www-releases/${{ inputs.release-version }} + mv ./docs-build/html-export/* www-releases/${{ inputs.release-version }} + cd www-releases + git checkout -b ${{ inputs.release-version }} git add ${{ inputs.release-version }} git config user.email 
"llvmbot@llvm.org" git config user.name "llvmbot" git commit -a -m "Add ${{ inputs.release-version }} documentation" - git push "https://$WWW_RELEASES_TOKEN@github.com/${{ github.repository_owner }}/www-releases" main:main + git push --force "https://$GH_TOKEN@github.com/llvmbot/www-releases.git" HEAD:refs/heads/${{ inputs.release-version }} + gh pr create -f -B main -H ${{ inputs.release-version }} -R llvmbot/www-releases diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 46bdf208be6ad..36c42fced93d0 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -165,6 +165,12 @@ bool shouldPrint(const BinaryFunction &Function) { } } + std::optional Origin = Function.getOriginSectionName(); + if (Origin && llvm::any_of(opts::PrintOnly, [&](const std::string &Name) { + return Name == *Origin; + })) + return true; + return false; } diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s index 313e103b19c05..889f6b6acd9f2 100644 --- a/bolt/test/AArch64/constant_island_pie_update.s +++ b/bolt/test/AArch64/constant_island_pie_update.s @@ -8,15 +8,15 @@ # RUN: %clang %cflags -fPIC -pie %t.o -o %t.rela.exe -nostdlib \ # RUN: -Wl,-q -Wl,-z,notext # RUN: llvm-bolt %t.rela.exe -o %t.rela.bolt --use-old-text=0 --lite=0 -# RUN: llvm-objdump -j .text -d --show-all-symbols %t.rela.bolt | FileCheck %s +# RUN: llvm-objdump -j .text -d -z --show-all-symbols %t.rela.bolt | FileCheck %s # RUN: llvm-readelf -rsW %t.rela.bolt | FileCheck --check-prefix=ELFCHECK %s // .relr.dyn # RUN: %clang %cflags -fPIC -pie %t.o -o %t.relr.exe -nostdlib \ # RUN: -Wl,-q -Wl,-z,notext -Wl,--pack-dyn-relocs=relr # RUN: llvm-objcopy --remove-section .rela.mytext %t.relr.exe # RUN: llvm-bolt %t.relr.exe -o %t.relr.bolt --use-old-text=0 --lite=0 -# RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s -# RUN: llvm-objdump -j .text -d %t.relr.bolt | \ +# RUN: llvm-objdump -j .text -d -z --show-all-symbols %t.relr.bolt | FileCheck %s +# RUN: llvm-objdump -j .text -d -z %t.relr.bolt | \ # RUN: FileCheck %s --check-prefix=ADDENDCHECK # RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=RELRELFCHECK %s # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s diff --git a/bolt/test/AArch64/update-weak-reference-symbol.s b/bolt/test/AArch64/update-weak-reference-symbol.s index 600a06b8b6d8f..46819e888b08e 100644 --- a/bolt/test/AArch64/update-weak-reference-symbol.s +++ b/bolt/test/AArch64/update-weak-reference-symbol.s @@ -3,7 +3,7 @@ // RUN: %clang %cflags -Wl,-z,notext -shared -Wl,-q %s -o %t.so // RUN: llvm-bolt %t.so -o %t.so.bolt // RUN: llvm-nm -n %t.so.bolt > %t.out.txt -// RUN: llvm-objdump -dj .rodata %t.so.bolt >> %t.out.txt +// RUN: llvm-objdump -z -dj .rodata %t.so.bolt >> %t.out.txt // RUN: FileCheck %s --input-file=%t.out.txt # CHECK: w func_1 diff --git a/bolt/test/X86/print-only-section.s b/bolt/test/X86/print-only-section.s new file mode 100644 index 0000000000000..d580818ca4fc6 --- /dev/null +++ b/bolt/test/X86/print-only-section.s @@ -0,0 +1,29 @@ +## Check that --print-only flag works with sections. 
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t.exe
+# RUN: llvm-bolt %t.exe -o %t.out --print-cfg --print-only=unused_code 2>&1 \
+# RUN:   | FileCheck %s
+
+# CHECK: Binary Function "foo"
+# CHECK-NOT: Binary Function "_start"
+
+  .text
+  .globl _start
+  .type _start, %function
+_start:
+  .cfi_startproc
+  ret
+  .cfi_endproc
+  .size _start, .-_start
+
+  .section unused_code,"ax",@progbits
+  .globl foo
+  .type foo, %function
+foo:
+  .cfi_startproc
+  ret
+  .cfi_endproc
+  .size foo, .-foo
diff --git a/bolt/test/merge-fdata-uninitialized-header.test b/bolt/test/merge-fdata-uninitialized-header.test
new file mode 100644
index 0000000000000..5336961278411
--- /dev/null
+++ b/bolt/test/merge-fdata-uninitialized-header.test
@@ -0,0 +1,45 @@
+## Test that merge-fdata correctly handles a YAML header with uninitialized
+## fields. a.yaml does not have hash-func set, and it used to crash merge-fdata.
+
+# REQUIRES: system-linux
+
+# RUN: split-file %s %t
+# RUN: not merge-fdata %t/a.yaml %t/b.yaml 2>&1 | FileCheck %s
+
+# CHECK: cannot merge profiles with different hash functions
+
+#--- a.yaml
+---
+header:
+  profile-version: 1
+  binary-name: 'a.out'
+  binary-build-id: ''
+  profile-flags: [ lbr ]
+  profile-origin: branch profile reader
+  profile-events: ''
+  dfs-order: false
+functions:
+  - name: 'main'
+    fid: 1
+    hash: 0x50BBA3441D436491
+    exec: 1
+    nblocks: 0
+...
+#--- b.yaml
+---
+header:
+  profile-version: 1
+  binary-name: 'a.out'
+  binary-build-id: ''
+  profile-flags: [ lbr ]
+  profile-origin: branch profile reader
+  profile-events: ''
+  dfs-order: false
+  hash-func: xxh3
+functions:
+  - name: 'main'
+    fid: 1
+    hash: 0x50BBA3441D436491
+    exec: 1
+    nblocks: 0
+...
diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp
index b640aae808f56..89ca46c1c0a8f 100644
--- a/bolt/tools/merge-fdata/merge-fdata.cpp
+++ b/bolt/tools/merge-fdata/merge-fdata.cpp
@@ -145,6 +145,10 @@ void mergeProfileHeaders(BinaryProfileHeader &MergedHeader,
     errs() << "WARNING: merging profiles with different sampling events\n";
     MergedHeader.EventNames += "," + Header.EventNames;
   }
+
+  if (MergedHeader.HashFunction != Header.HashFunction)
+    report_error("merge conflict",
+                 "cannot merge profiles with different hash functions");
 }
 
 void mergeBasicBlockProfile(BinaryBasicBlockProfile &MergedBB,
@@ -386,6 +390,7 @@ int main(int argc, char **argv) {
 
   // Merged information for all functions.
   StringMap<BinaryFunctionProfile> MergedBFs;
 
+  bool FirstHeader = true;
   for (std::string &InputDataFilename : Inputs) {
     ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
         MemoryBuffer::getFileOrSTDIN(InputDataFilename);
@@ -409,7 +414,12 @@ int main(int argc, char **argv) {
     }
 
     // Merge the header.
-    mergeProfileHeaders(MergedHeader, BP.Header);
+    if (FirstHeader) {
+      MergedHeader = BP.Header;
+      FirstHeader = false;
+    } else {
+      mergeProfileHeaders(MergedHeader, BP.Header);
+    }
 
     // Do the function merge.
     for (BinaryFunctionProfile &BF : BP.Functions) {
diff --git a/clang-tools-extra/clang-move/tool/ClangMove.cpp b/clang-tools-extra/clang-move/tool/ClangMove.cpp
index 1560dcaad6779..655ea81ee37d4 100644
--- a/clang-tools-extra/clang-move/tool/ClangMove.cpp
+++ b/clang-tools-extra/clang-move/tool/ClangMove.cpp
@@ -199,7 +199,7 @@ int main(int argc, const char **argv) {
   for (auto I = Files.begin(), E = Files.end(); I != E; ++I) {
     OS << "  {\n";
     OS << "    \"FilePath\": \"" << *I << "\",\n";
-    const auto Entry = FileMgr.getFile(*I);
+    const auto Entry = FileMgr.getOptionalFileRef(*I);
     auto ID = SM.translateFile(*Entry);
     std::string Content;
     llvm::raw_string_ostream ContentStream(Content);
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 689eb92a3d8d1..642f025359b1d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -78,6 +78,7 @@
 #include "SuspiciousStringviewDataUsageCheck.h"
 #include "SwappedArgumentsCheck.h"
 #include "SwitchMissingDefaultCaseCheck.h"
+#include "TaggedUnionMemberCountCheck.h"
 #include "TerminatingContinueCheck.h"
 #include "ThrowKeywordMissingCheck.h"
 #include "TooSmallLoopVariableCheck.h"
@@ -229,6 +230,8 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-suspicious-stringview-data-usage");
     CheckFactories.registerCheck<SwappedArgumentsCheck>(
         "bugprone-swapped-arguments");
+    CheckFactories.registerCheck<TaggedUnionMemberCountCheck>(
+        "bugprone-tagged-union-member-count");
     CheckFactories.registerCheck<TerminatingContinueCheck>(
         "bugprone-terminating-continue");
     CheckFactories.registerCheck<ThrowKeywordMissingCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index cb0d8ae98bac5..9f7ecb9623c53 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -73,6 +73,7 @@ add_clang_library(clangTidyBugproneModule
   SuspiciousSemicolonCheck.cpp
   SuspiciousStringCompareCheck.cpp
   SwappedArgumentsCheck.cpp
+  TaggedUnionMemberCountCheck.cpp
  TerminatingContinueCheck.cpp
  ThrowKeywordMissingCheck.cpp
  TooSmallLoopVariableCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp
index 0b38b18208194..d77df50f8fea2 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp
@@ -107,7 +107,6 @@ static std::string getNameOfNamespace(const CXXRecordDecl *Decl) {
   std::string Ns;
   llvm::raw_string_ostream OStream(Ns);
   NsDecl->printQualifiedName(OStream);
-  OStream.flush();
   return Ns.empty() ? "(global)" : Ns;
"(global)" : Ns; } diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp index 378427a1eab00..f05924b81c4c0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp @@ -7,19 +7,17 @@ //===----------------------------------------------------------------------===// #include "PosixReturnCheck.h" -#include "../utils/Matchers.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Lex/Lexer.h" using namespace clang::ast_matchers; namespace clang::tidy::bugprone { -static StringRef getFunctionSpelling(const MatchFinder::MatchResult &Result, - const char *BindingStr) { - const CallExpr *MatchedCall = cast( - (Result.Nodes.getNodeAs(BindingStr))->getLHS()); +static StringRef getFunctionSpelling(const MatchFinder::MatchResult &Result) { + const auto *MatchedCall = Result.Nodes.getNodeAs("call"); const SourceManager &SM = *Result.SourceManager; return Lexer::getSourceText(CharSourceRange::getTokenRange( MatchedCall->getCallee()->getSourceRange()), @@ -27,32 +25,40 @@ static StringRef getFunctionSpelling(const MatchFinder::MatchResult &Result, } void PosixReturnCheck::registerMatchers(MatchFinder *Finder) { + const auto PosixCall = + callExpr(callee(functionDecl( + anyOf(matchesName("^::posix_"), matchesName("^::pthread_")), + unless(hasName("::posix_openpt"))))) + .bind("call"); + const auto ZeroIntegerLiteral = integerLiteral(equals(0)); + const auto NegIntegerLiteral = + unaryOperator(hasOperatorName("-"), hasUnaryOperand(integerLiteral())); + Finder->addMatcher( binaryOperator( - hasOperatorName("<"), - hasLHS(callExpr(callee(functionDecl( - anyOf(matchesName("^::posix_"), matchesName("^::pthread_")), - unless(hasName("::posix_openpt")))))), - hasRHS(integerLiteral(equals(0)))) + anyOf(allOf(hasOperatorName("<"), hasLHS(PosixCall), + hasRHS(ZeroIntegerLiteral)), + allOf(hasOperatorName(">"), hasLHS(ZeroIntegerLiteral), + hasRHS(PosixCall)))) .bind("ltzop"), this); Finder->addMatcher( binaryOperator( - hasOperatorName(">="), - hasLHS(callExpr(callee(functionDecl( - anyOf(matchesName("^::posix_"), matchesName("^::pthread_")), - unless(hasName("::posix_openpt")))))), - hasRHS(integerLiteral(equals(0)))) + anyOf(allOf(hasOperatorName(">="), hasLHS(PosixCall), + hasRHS(ZeroIntegerLiteral)), + allOf(hasOperatorName("<="), hasLHS(ZeroIntegerLiteral), + hasRHS(PosixCall)))) .bind("atop"), this); + Finder->addMatcher(binaryOperator(hasAnyOperatorName("==", "!="), + hasOperands(PosixCall, NegIntegerLiteral)) + .bind("binop"), + this); Finder->addMatcher( - binaryOperator( - hasAnyOperatorName("==", "!=", "<=", "<"), - hasLHS(callExpr(callee(functionDecl( - anyOf(matchesName("^::posix_"), matchesName("^::pthread_")), - unless(hasName("::posix_openpt")))))), - hasRHS(unaryOperator(hasOperatorName("-"), - hasUnaryOperand(integerLiteral())))) + binaryOperator(anyOf(allOf(hasAnyOperatorName("<=", "<"), + hasLHS(PosixCall), hasRHS(NegIntegerLiteral)), + allOf(hasAnyOperatorName(">", ">="), + hasLHS(NegIntegerLiteral), hasRHS(PosixCall)))) .bind("binop"), this); } @@ -61,10 +67,13 @@ void PosixReturnCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *LessThanZeroOp = Result.Nodes.getNodeAs("ltzop")) { SourceLocation OperatorLoc = LessThanZeroOp->getOperatorLoc(); + StringRef NewBinOp = + LessThanZeroOp->getOpcode() == BinaryOperator::Opcode::BO_LT ? 
">" + : "<"; diag(OperatorLoc, "the comparison always evaluates to false because %0 " "always returns non-negative values") - << getFunctionSpelling(Result, "ltzop") - << FixItHint::CreateReplacement(OperatorLoc, Twine(">").str()); + << getFunctionSpelling(Result) + << FixItHint::CreateReplacement(OperatorLoc, NewBinOp); return; } if (const auto *AlwaysTrueOp = @@ -72,12 +81,12 @@ void PosixReturnCheck::check(const MatchFinder::MatchResult &Result) { diag(AlwaysTrueOp->getOperatorLoc(), "the comparison always evaluates to true because %0 always returns " "non-negative values") - << getFunctionSpelling(Result, "atop"); + << getFunctionSpelling(Result); return; } const auto *BinOp = Result.Nodes.getNodeAs("binop"); diag(BinOp->getOperatorLoc(), "%0 only returns non-negative values") - << getFunctionSpelling(Result, "binop"); + << getFunctionSpelling(Result); } } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp new file mode 100644 index 0000000000000..db99ef3786e5f --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.cpp @@ -0,0 +1,199 @@ +//===--- TaggedUnionMemberCountCheck.cpp - clang-tidy ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TaggedUnionMemberCountCheck.h" +#include "../utils/OptionsUtils.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::bugprone { + +static constexpr llvm::StringLiteral StrictModeOptionName = "StrictMode"; +static constexpr llvm::StringLiteral EnableCountingEnumHeuristicOptionName = + "EnableCountingEnumHeuristic"; +static constexpr llvm::StringLiteral CountingEnumPrefixesOptionName = + "CountingEnumPrefixes"; +static constexpr llvm::StringLiteral CountingEnumSuffixesOptionName = + "CountingEnumSuffixes"; + +static constexpr bool StrictModeOptionDefaultValue = false; +static constexpr bool EnableCountingEnumHeuristicOptionDefaultValue = true; +static constexpr llvm::StringLiteral CountingEnumPrefixesOptionDefaultValue = + ""; +static constexpr llvm::StringLiteral CountingEnumSuffixesOptionDefaultValue = + "count"; + +static constexpr llvm::StringLiteral RootMatchBindName = "root"; +static constexpr llvm::StringLiteral UnionMatchBindName = "union"; +static constexpr llvm::StringLiteral TagMatchBindName = "tags"; + +namespace { + +AST_MATCHER_P2(RecordDecl, fieldCountOfKindIsOne, + ast_matchers::internal::Matcher, InnerMatcher, + StringRef, BindName) { + // BoundNodesTreeBuilder resets itself when a match occurs. + // So to avoid losing previously saved binds, a temporary instance + // is used for matching. 
+  //
+  // For precedent, see commit: 5b07de1a5faf4a22ae6fd982b877c5e7e3a76559
+  clang::ast_matchers::internal::BoundNodesTreeBuilder TempBuilder;
+
+  const FieldDecl *FirstMatch = nullptr;
+  for (const FieldDecl *Field : Node.fields()) {
+    if (InnerMatcher.matches(*Field, Finder, &TempBuilder)) {
+      if (FirstMatch) {
+        return false;
+      } else {
+        FirstMatch = Field;
+      }
+    }
+  }
+
+  if (FirstMatch) {
+    Builder->setBinding(BindName, clang::DynTypedNode::create(*FirstMatch));
+    return true;
+  }
+  return false;
+}
+
+} // namespace
+
+TaggedUnionMemberCountCheck::TaggedUnionMemberCountCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      StrictMode(
+          Options.get(StrictModeOptionName, StrictModeOptionDefaultValue)),
+      EnableCountingEnumHeuristic(
+          Options.get(EnableCountingEnumHeuristicOptionName,
+                      EnableCountingEnumHeuristicOptionDefaultValue)),
+      CountingEnumPrefixes(utils::options::parseStringList(
+          Options.get(CountingEnumPrefixesOptionName,
+                      CountingEnumPrefixesOptionDefaultValue))),
+      CountingEnumSuffixes(utils::options::parseStringList(
+          Options.get(CountingEnumSuffixesOptionName,
+                      CountingEnumSuffixesOptionDefaultValue))) {
+  if (!EnableCountingEnumHeuristic) {
+    if (Options.get(CountingEnumPrefixesOptionName))
+      configurationDiag("%0: Counting enum heuristic is disabled but "
+                        "%1 is set")
+          << Name << CountingEnumPrefixesOptionName;
+    if (Options.get(CountingEnumSuffixesOptionName))
+      configurationDiag("%0: Counting enum heuristic is disabled but "
+                        "%1 is set")
+          << Name << CountingEnumSuffixesOptionName;
+  }
+}
+
+void TaggedUnionMemberCountCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, StrictModeOptionName, StrictMode);
+  Options.store(Opts, EnableCountingEnumHeuristicOptionName,
+                EnableCountingEnumHeuristic);
+  Options.store(Opts, CountingEnumPrefixesOptionName,
+                utils::options::serializeStringList(CountingEnumPrefixes));
+  Options.store(Opts, CountingEnumSuffixesOptionName,
+                utils::options::serializeStringList(CountingEnumSuffixes));
+}
+
+void TaggedUnionMemberCountCheck::registerMatchers(MatchFinder *Finder) {
+
+  auto UnionField = fieldDecl(hasType(qualType(
+      hasCanonicalType(recordType(hasDeclaration(recordDecl(isUnion())))))));
+
+  auto EnumField = fieldDecl(hasType(
+      qualType(hasCanonicalType(enumType(hasDeclaration(enumDecl()))))));
+
+  auto hasOneUnionField = fieldCountOfKindIsOne(UnionField, UnionMatchBindName);
+  auto hasOneEnumField = fieldCountOfKindIsOne(EnumField, TagMatchBindName);
+
+  Finder->addMatcher(recordDecl(anyOf(isStruct(), isClass()), hasOneUnionField,
+                                hasOneEnumField, unless(isImplicit()))
+                         .bind(RootMatchBindName),
+                     this);
+}
+
+bool TaggedUnionMemberCountCheck::isCountingEnumLikeName(StringRef Name) const {
+  if (llvm::any_of(CountingEnumPrefixes, [Name](StringRef Prefix) -> bool {
+        return Name.starts_with_insensitive(Prefix);
+      }))
+    return true;
+  if (llvm::any_of(CountingEnumSuffixes, [Name](StringRef Suffix) -> bool {
+        return Name.ends_with_insensitive(Suffix);
+      }))
+    return true;
+  return false;
+}
+
+std::pair<std::size_t, const EnumConstantDecl *>
+TaggedUnionMemberCountCheck::getNumberOfEnumValues(const EnumDecl *ED) {
+  llvm::SmallSet<llvm::APSInt, 16> EnumValues;
+
+  const EnumConstantDecl *LastEnumConstant = nullptr;
+  for (const EnumConstantDecl *Enumerator : ED->enumerators()) {
+    EnumValues.insert(Enumerator->getInitVal());
+    LastEnumConstant = Enumerator;
+  }
+
+  if (EnableCountingEnumHeuristic && LastEnumConstant &&
+      isCountingEnumLikeName(LastEnumConstant->getName()) &&
+      (LastEnumConstant->getInitVal() == (EnumValues.size() - 1))) {
+    return {EnumValues.size() - 1, LastEnumConstant};
+  }
+
+  return {EnumValues.size(), nullptr};
+}
+
+void TaggedUnionMemberCountCheck::check(
+    const MatchFinder::MatchResult &Result) {
+  const auto *Root = Result.Nodes.getNodeAs<RecordDecl>(RootMatchBindName);
+  const auto *UnionField =
+      Result.Nodes.getNodeAs<FieldDecl>(UnionMatchBindName);
+  const auto *TagField = Result.Nodes.getNodeAs<FieldDecl>(TagMatchBindName);
+
+  assert(Root && "Root is missing!");
+  assert(UnionField && "UnionField is missing!");
+  assert(TagField && "TagField is missing!");
+  if (!Root || !UnionField || !TagField)
+    return;
+
+  const auto *UnionDef =
+      UnionField->getType().getCanonicalType().getTypePtr()->getAsRecordDecl();
+  const auto *EnumDef = llvm::dyn_cast<EnumDecl>(
+      TagField->getType().getCanonicalType().getTypePtr()->getAsTagDecl());
+
+  assert(UnionDef && "UnionDef is missing!");
+  assert(EnumDef && "EnumDef is missing!");
+  if (!UnionDef || !EnumDef)
+    return;
+
+  const std::size_t UnionMemberCount = llvm::range_size(UnionDef->fields());
+  auto [TagCount, CountingEnumConstantDecl] = getNumberOfEnumValues(EnumDef);
+
+  if (UnionMemberCount > TagCount) {
+    diag(Root->getLocation(),
+         "tagged union has more data members (%0) than tags (%1)!")
+        << UnionMemberCount << TagCount;
+  } else if (StrictMode && UnionMemberCount < TagCount) {
+    diag(Root->getLocation(),
+         "tagged union has fewer data members (%0) than tags (%1)!")
+        << UnionMemberCount << TagCount;
+  }
+
+  if (CountingEnumConstantDecl) {
+    diag(CountingEnumConstantDecl->getLocation(),
+         "assuming that this constant is just an auxiliary value and not "
+         "used for indicating a valid union data member",
+         DiagnosticIDs::Note);
+  }
+}
+
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h
new file mode 100644
index 0000000000000..8b9d677d00b40
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h
@@ -0,0 +1,41 @@
+//===--- TaggedUnionMemberCountCheck.h - clang-tidy -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_TAGGEDUNIONMEMBERCOUNTCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_TAGGEDUNIONMEMBERCOUNTCHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::bugprone {
+
+/// Gives warnings for tagged unions, where the number of tags is
+/// different from the number of data members inside the union.
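+///
+/// A minimal sketch of the kind of code this check is aimed at (illustrative
+/// example only, not taken from the check's tests):
+/// \code
+///   enum Tags { TAG_A, TAG_B };   // two tags ...
+///   struct TaggedUnion {
+///     enum Tags Kind;
+///     union {
+///       int A;
+///       float B;
+///       char *C;                  // ... but three union data members
+///     } Data;
+///   };
+/// \endcode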
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/tagged-union-member-count.html
+class TaggedUnionMemberCountCheck : public ClangTidyCheck {
+public:
+  TaggedUnionMemberCountCheck(StringRef Name, ClangTidyContext *Context);
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+private:
+  const bool StrictMode;
+  const bool EnableCountingEnumHeuristic;
+  const std::vector<llvm::StringRef> CountingEnumPrefixes;
+  const std::vector<llvm::StringRef> CountingEnumSuffixes;
+
+  std::pair<std::size_t, const EnumConstantDecl *>
+  getNumberOfEnumValues(const EnumDecl *ED);
+  bool isCountingEnumLikeName(StringRef Name) const;
+};
+
+} // namespace clang::tidy::bugprone
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_TAGGEDUNIONMEMBERCOUNTCHECK_H
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
index ea7eaa0b0ff81..604a7cac0e490 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "UnsafeFunctionsCheck.h"
+#include "../utils/OptionsUtils.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/Lex/PPCallbacks.h"
@@ -18,6 +19,10 @@ using namespace llvm;
 
 namespace clang::tidy::bugprone {
 
+static constexpr llvm::StringLiteral OptionNameCustomFunctions =
+    "CustomFunctions";
+static constexpr llvm::StringLiteral OptionNameReportDefaultFunctions =
+    "ReportDefaultFunctions";
 static constexpr llvm::StringLiteral OptionNameReportMoreUnsafeFunctions =
     "ReportMoreUnsafeFunctions";
 
@@ -26,6 +31,8 @@ static constexpr llvm::StringLiteral FunctionNamesWithAnnexKReplacementId =
 static constexpr llvm::StringLiteral FunctionNamesId = "FunctionsNames";
 static constexpr llvm::StringLiteral AdditionalFunctionNamesId =
     "AdditionalFunctionsNames";
+static constexpr llvm::StringLiteral CustomFunctionNamesId =
+    "CustomFunctionNames";
 static constexpr llvm::StringLiteral DeclRefId = "DRE";
 
 static std::optional<std::string>
@@ -127,57 +134,128 @@ static bool isAnnexKAvailable(std::optional<bool> &CacheVar, Preprocessor *PP,
   return CacheVar.value();
 }
 
+static std::vector<UnsafeFunctionsCheck::CheckedFunction>
+parseCheckedFunctions(StringRef Option, ClangTidyContext *Context) {
+  const std::vector<StringRef> Functions =
+      utils::options::parseStringList(Option);
+  std::vector<UnsafeFunctionsCheck::CheckedFunction> Result;
+  Result.reserve(Functions.size());
+
+  for (StringRef Function : Functions) {
+    if (Function.empty())
+      continue;
+
+    const auto [Name, Rest] = Function.split(',');
+    const auto [Replacement, Reason] = Rest.split(',');
+
+    if (Name.trim().empty()) {
+      Context->configurationDiag("invalid configuration value for option '%0'; "
+                                 "expected the name of an unsafe function")
+          << OptionNameCustomFunctions;
+      continue;
+    }
+
+    Result.push_back(
+        {Name.trim().str(),
+         matchers::MatchesAnyListedNameMatcher::NameMatcher(Name.trim()),
+         Replacement.trim().str(), Reason.trim().str()});
+  }
+
+  return Result;
+}
+
+static std::string serializeCheckedFunctions(
+    const std::vector<UnsafeFunctionsCheck::CheckedFunction> &Functions) {
+  std::vector<std::string> Result;
+  Result.reserve(Functions.size());
+
+  for (const auto &Entry : Functions) {
+    if (Entry.Reason.empty())
+      Result.push_back(Entry.Name + "," + Entry.Replacement);
+    else
+      Result.push_back(Entry.Name + "," + Entry.Replacement + "," +
Entry.Reason); + } + + return llvm::join(Result, ";"); +} + UnsafeFunctionsCheck::UnsafeFunctionsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), + CustomFunctions(parseCheckedFunctions( + Options.get(OptionNameCustomFunctions, ""), Context)), + ReportDefaultFunctions( + Options.get(OptionNameReportDefaultFunctions, true)), ReportMoreUnsafeFunctions( Options.get(OptionNameReportMoreUnsafeFunctions, true)) {} void UnsafeFunctionsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, OptionNameCustomFunctions, + serializeCheckedFunctions(CustomFunctions)); + Options.store(Opts, OptionNameReportDefaultFunctions, ReportDefaultFunctions); Options.store(Opts, OptionNameReportMoreUnsafeFunctions, ReportMoreUnsafeFunctions); } void UnsafeFunctionsCheck::registerMatchers(MatchFinder *Finder) { - if (getLangOpts().C11) { - // Matching functions with safe replacements only in Annex K. - auto FunctionNamesWithAnnexKReplacementMatcher = hasAnyName( - "::bsearch", "::ctime", "::fopen", "::fprintf", "::freopen", "::fscanf", - "::fwprintf", "::fwscanf", "::getenv", "::gmtime", "::localtime", - "::mbsrtowcs", "::mbstowcs", "::memcpy", "::memmove", "::memset", - "::printf", "::qsort", "::scanf", "::snprintf", "::sprintf", "::sscanf", - "::strcat", "::strcpy", "::strerror", "::strlen", "::strncat", - "::strncpy", "::strtok", "::swprintf", "::swscanf", "::vfprintf", - "::vfscanf", "::vfwprintf", "::vfwscanf", "::vprintf", "::vscanf", - "::vsnprintf", "::vsprintf", "::vsscanf", "::vswprintf", "::vswscanf", - "::vwprintf", "::vwscanf", "::wcrtomb", "::wcscat", "::wcscpy", - "::wcslen", "::wcsncat", "::wcsncpy", "::wcsrtombs", "::wcstok", - "::wcstombs", "::wctomb", "::wmemcpy", "::wmemmove", "::wprintf", - "::wscanf"); + if (ReportDefaultFunctions) { + if (getLangOpts().C11) { + // Matching functions with safe replacements only in Annex K. + auto FunctionNamesWithAnnexKReplacementMatcher = hasAnyName( + "::bsearch", "::ctime", "::fopen", "::fprintf", "::freopen", + "::fscanf", "::fwprintf", "::fwscanf", "::getenv", "::gmtime", + "::localtime", "::mbsrtowcs", "::mbstowcs", "::memcpy", "::memmove", + "::memset", "::printf", "::qsort", "::scanf", "::snprintf", + "::sprintf", "::sscanf", "::strcat", "::strcpy", "::strerror", + "::strlen", "::strncat", "::strncpy", "::strtok", "::swprintf", + "::swscanf", "::vfprintf", "::vfscanf", "::vfwprintf", "::vfwscanf", + "::vprintf", "::vscanf", "::vsnprintf", "::vsprintf", "::vsscanf", + "::vswprintf", "::vswscanf", "::vwprintf", "::vwscanf", "::wcrtomb", + "::wcscat", "::wcscpy", "::wcslen", "::wcsncat", "::wcsncpy", + "::wcsrtombs", "::wcstok", "::wcstombs", "::wctomb", "::wmemcpy", + "::wmemmove", "::wprintf", "::wscanf"); + Finder->addMatcher( + declRefExpr(to(functionDecl(FunctionNamesWithAnnexKReplacementMatcher) + .bind(FunctionNamesWithAnnexKReplacementId))) + .bind(DeclRefId), + this); + } + + // Matching functions with replacements without Annex K. + auto FunctionNamesMatcher = + hasAnyName("::asctime", "asctime_r", "::gets", "::rewind", "::setbuf"); Finder->addMatcher( - declRefExpr(to(functionDecl(FunctionNamesWithAnnexKReplacementMatcher) - .bind(FunctionNamesWithAnnexKReplacementId))) + declRefExpr( + to(functionDecl(FunctionNamesMatcher).bind(FunctionNamesId))) .bind(DeclRefId), this); + + if (ReportMoreUnsafeFunctions) { + // Matching functions with replacements without Annex K, at user request. 
+      auto AdditionalFunctionNamesMatcher =
+          hasAnyName("::bcmp", "::bcopy", "::bzero", "::getpw", "::vfork");
+      Finder->addMatcher(
+          declRefExpr(to(functionDecl(AdditionalFunctionNamesMatcher)
+                             .bind(AdditionalFunctionNamesId)))
+              .bind(DeclRefId),
+          this);
+    }
   }
 
-  // Matching functions with replacements without Annex K.
-  auto FunctionNamesMatcher =
-      hasAnyName("::asctime", "asctime_r", "::gets", "::rewind", "::setbuf");
-  Finder->addMatcher(
-      declRefExpr(to(functionDecl(FunctionNamesMatcher).bind(FunctionNamesId)))
-          .bind(DeclRefId),
-      this);
-
-  if (ReportMoreUnsafeFunctions) {
-    // Matching functions with replacements without Annex K, at user request.
-    auto AdditionalFunctionNamesMatcher =
-        hasAnyName("::bcmp", "::bcopy", "::bzero", "::getpw", "::vfork");
-    Finder->addMatcher(
-        declRefExpr(to(functionDecl(AdditionalFunctionNamesMatcher)
-                           .bind(AdditionalFunctionNamesId)))
-            .bind(DeclRefId),
-        this);
+  if (!CustomFunctions.empty()) {
+    std::vector<llvm::StringRef> FunctionNames;
+    FunctionNames.reserve(CustomFunctions.size());
+
+    for (const auto &Entry : CustomFunctions)
+      FunctionNames.push_back(Entry.Name);
+
+    auto CustomFunctionsMatcher = matchers::matchesAnyListedName(FunctionNames);
+
+    Finder->addMatcher(declRefExpr(to(functionDecl(CustomFunctionsMatcher)
+                                          .bind(CustomFunctionNamesId)))
+                           .bind(DeclRefId),
+                       this);
   }
 }
 
@@ -186,16 +264,46 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) {
   const auto *FuncDecl = cast<FunctionDecl>(DeclRef->getDecl());
   assert(DeclRef && FuncDecl && "No valid matched node in check()");
 
+  // Only one of these is matched at a time.
   const auto *AnnexK = Result.Nodes.getNodeAs<FunctionDecl>(
       FunctionNamesWithAnnexKReplacementId);
   const auto *Normal = Result.Nodes.getNodeAs<FunctionDecl>(FunctionNamesId);
   const auto *Additional =
       Result.Nodes.getNodeAs<FunctionDecl>(AdditionalFunctionNamesId);
-  assert((AnnexK || Normal || Additional) && "No valid match category.");
+  const auto *Custom =
+      Result.Nodes.getNodeAs<FunctionDecl>(CustomFunctionNamesId);
+  assert((AnnexK || Normal || Additional || Custom) &&
+         "No valid match category.");
 
   bool AnnexKIsAvailable =
       isAnnexKAvailable(IsAnnexKAvailable, PP, getLangOpts());
   StringRef FunctionName = FuncDecl->getName();
+
+  if (Custom) {
+    for (const auto &Entry : CustomFunctions) {
+      if (Entry.Pattern.match(*FuncDecl)) {
+        StringRef Reason =
+            Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str();
"is marked as unsafe" : Entry.Reason.c_str(); + + if (Entry.Replacement.empty()) { + diag(DeclRef->getExprLoc(), "function %0 %1; it should not be used") + << FuncDecl << Reason << Entry.Replacement + << DeclRef->getSourceRange(); + } else { + diag(DeclRef->getExprLoc(), + "function %0 %1; '%2' should be used instead") + << FuncDecl << Reason << Entry.Replacement + << DeclRef->getSourceRange(); + } + + return; + } + } + + llvm_unreachable("No custom function was matched."); + return; + } + const std::optional ReplacementFunctionName = [&]() -> std::optional { if (AnnexK) { diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h index 5adfee60d1a7d..63058c326ef29 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNSAFEFUNCTIONSCHECK_H #include "../ClangTidyCheck.h" +#include "../utils/Matchers.h" #include namespace clang::tidy::bugprone { @@ -32,7 +33,18 @@ class UnsafeFunctionsCheck : public ClangTidyCheck { Preprocessor *ModuleExpanderPP) override; void onEndOfTranslationUnit() override; + struct CheckedFunction { + std::string Name; + matchers::MatchesAnyListedNameMatcher::NameMatcher Pattern; + std::string Replacement; + std::string Reason; + }; + private: + const std::vector CustomFunctions; + + // If true, the default set of functions are reported. + const bool ReportDefaultFunctions; /// If true, additional functions from widely used API-s (such as POSIX) are /// added to the list of reported functions. const bool ReportMoreUnsafeFunctions; diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp index 90b317527ee41..1ff61bae46b1e 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp @@ -25,6 +25,13 @@ AST_MATCHER_P(DeducedTemplateSpecializationType, refsToTemplatedDecl, return false; } +AST_MATCHER_P(Type, asTagDecl, clang::ast_matchers::internal::Matcher, + DeclMatcher) { + if (const TagDecl *ND = Node.getAsTagDecl()) + return DeclMatcher.matches(*ND, Finder, Builder); + return false; +} + } // namespace // A function that helps to tell whether a TargetDecl in a UsingDecl will be @@ -61,7 +68,8 @@ void UnusedUsingDeclsCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(userDefinedLiteral().bind("used"), this); Finder->addMatcher( loc(elaboratedType(unless(hasQualifier(nestedNameSpecifier())), - hasUnqualifiedDesugaredType(type().bind("usedType")))), + hasUnqualifiedDesugaredType( + type(asTagDecl(tagDecl().bind("used")))))), this); // Cases where we can identify the UsingShadowDecl directly, rather than // just its target. 
@@ -139,12 +147,6 @@ void UnusedUsingDeclsCheck::check(const MatchFinder::MatchResult &Result) {
     return;
   }
 
-  if (const auto *T = Result.Nodes.getNodeAs<Type>("usedType")) {
-    if (const auto *ND = T->getAsTagDecl())
-      RemoveNamedDecl(ND);
-    return;
-  }
-
   if (const auto *UsedShadow =
           Result.Nodes.getNodeAs<UsingShadowDecl>("usedShadow")) {
     removeFromFoundDecls(UsedShadow->getTargetDecl());
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
index 6a003a347bada..108717e151b57 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
@@ -35,10 +35,20 @@ AST_MATCHER(Type, sugaredNullptrType) {
 /// to null within.
 /// Finding sequences of explicit casts is necessary so that an entire sequence
 /// can be replaced instead of just the inner-most implicit cast.
+///
+/// TODO/NOTE: The second "anyOf" below discards matches on a substituted type,
+/// since we don't know if that would _always_ be a pointer type for all other
+/// specializations, unless the expression was "__null", in which case we
+/// assume that all specializations are expected to be for pointer types.
+/// Ideally this would check for the "NULL" macro instead, but that'd be harder
+/// to express. In practice, "NULL" is often defined as "__null", and this is a
+/// useful condition.
 StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) {
   auto ImplicitCastToNull = implicitCastExpr(
       anyOf(hasCastKind(CK_NullToPointer), hasCastKind(CK_NullToMemberPointer)),
-      unless(hasImplicitDestinationType(qualType(substTemplateTypeParmType()))),
+      anyOf(hasSourceExpression(gnuNullExpr()),
+            unless(hasImplicitDestinationType(
+                qualType(substTemplateTypeParmType())))),
       unless(hasSourceExpression(hasType(sugaredNullptrType()))),
       unless(hasImplicitDestinationType(
           qualType(matchers::matchesAnyListedTypeName(NameList)))));
diff --git a/clang-tools-extra/clang-tidy/utils/Matchers.h b/clang-tools-extra/clang-tidy/utils/Matchers.h
index 5fd98db967870..451c4ce92585b 100644
--- a/clang-tools-extra/clang-tidy/utils/Matchers.h
+++ b/clang-tools-extra/clang-tidy/utils/Matchers.h
@@ -85,15 +85,7 @@ class MatchesAnyListedNameMatcher
         NameList.begin(), NameList.end(), std::back_inserter(NameMatchers),
         [](const llvm::StringRef Name) { return NameMatcher(Name); });
   }
-  bool matches(
-      const NamedDecl &Node, ast_matchers::internal::ASTMatchFinder *Finder,
-      ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override {
-    return llvm::any_of(NameMatchers, [&Node](const NameMatcher &NM) {
-      return NM.match(Node);
-    });
-  }
 
-private:
   class NameMatcher {
     llvm::Regex Regex;
     enum class MatchMode {
@@ -136,6 +128,15 @@ class MatchesAnyListedNameMatcher
     }
   };
 
+  bool matches(
+      const NamedDecl &Node, ast_matchers::internal::ASTMatchFinder *Finder,
+      ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override {
+    return llvm::any_of(NameMatchers, [&Node](const NameMatcher &NM) {
+      return NM.match(Node);
+    });
+  }
+
+private:
   std::vector<NameMatcher> NameMatchers;
 };
 
diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp
index fda1e5fdf8d82..333fc10f17d7b 100644
--- a/clang-tools-extra/clangd/AST.cpp
+++ b/clang-tools-extra/clangd/AST.cpp
@@ -187,7 +187,6 @@ std::string printQualifiedName(const NamedDecl &ND) {
   // In clangd, context is usually available and paths are mostly noise.
   Policy.AnonymousTagLocations = false;
   ND.printQualifiedName(OS, Policy);
-  OS.flush();
   assert(!StringRef(QName).starts_with("::"));
   return QName;
 }
@@ -270,7 +269,6 @@ std::string printTemplateSpecializationArgs(const NamedDecl &ND) {
     // location information.
     printTemplateArgumentList(OS, Cls->getTemplateArgs().asArray(), Policy);
   }
-  OS.flush();
   return TemplateArgs;
 }
 
@@ -303,7 +301,6 @@ std::string printObjCMethod(const ObjCMethodDecl &Method) {
     OS << ", ...";
 
   OS << ']';
-  OS.flush();
   return Name;
 }
 
@@ -314,7 +311,6 @@ std::string printObjCContainer(const ObjCContainerDecl &C) {
     const ObjCInterfaceDecl *Class = Category->getClassInterface();
     OS << getNameOrErrForObjCInterface(Class) << '(' << Category->getName()
        << ')';
-    OS.flush();
     return Name;
   }
   if (const ObjCCategoryImplDecl *CID = dyn_cast<ObjCCategoryImplDecl>(&C)) {
@@ -322,7 +318,6 @@
     std::string Name;
     llvm::raw_string_ostream OS(Name);
     const ObjCInterfaceDecl *Class = CID->getClassInterface();
     OS << getNameOrErrForObjCInterface(Class) << '(' << CID->getName() << ')';
-    OS.flush();
     return Name;
   }
   return C.getNameAsString();
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 89eee392837af..6711eb7dc10f8 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1409,6 +1409,9 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   Clang->getPreprocessorOpts().SingleFileParseMode = CompletingInPreamble;
   Clang->setCodeCompletionConsumer(Consumer.release());
 
+  if (Input.Preamble.RequiredModules)
+    Input.Preamble.RequiredModules->adjustHeaderSearchOptions(Clang->getHeaderSearchOpts());
+
   SyntaxOnlyAction Action;
   if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0])) {
     log("BeginSourceFile() failed when running codeComplete for {0}",
@@ -2122,7 +2125,7 @@ clang::CodeCompleteOptions CodeCompleteOptions::getClangCompleteOpts() const {
   // When an index is used, Sema is responsible for completing the main file,
   // the index can provide results from the preamble.
   // Tell Sema not to deserialize the preamble to look for results.
-  Result.LoadExternal = !Index;
+  Result.LoadExternal = ForceLoadPreamble || !Index;
 
   Result.IncludeFixIts = IncludeFixIts;
   return Result;
diff --git a/clang-tools-extra/clangd/CodeComplete.h b/clang-tools-extra/clangd/CodeComplete.h
index a7c1ae95dcbf4..9bcdeb0227cd4 100644
--- a/clang-tools-extra/clangd/CodeComplete.h
+++ b/clang-tools-extra/clangd/CodeComplete.h
@@ -52,6 +52,11 @@ struct CodeCompleteOptions {
   /// For example, private members are usually inaccessible.
   bool IncludeIneligibleResults = false;
 
+  /// Force Sema to load decls from the preamble even if an index is provided.
+  /// This is helpful for cases where the index can't provide symbols, e.g.
+  /// with experimental C++20 modules.
+  bool ForceLoadPreamble = false;
+
   /// Combine overloads into a single completion item where possible.
   /// If none, the implementation may choose an appropriate behavior.
   /// (In practice, ClangdLSPServer enables bundling if the client claims
diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp
index 552dd36b6900b..a59d1e7ac8409 100644
--- a/clang-tools-extra/clangd/Diagnostics.cpp
+++ b/clang-tools-extra/clangd/Diagnostics.cpp
@@ -319,7 +319,6 @@ std::string mainMessage(const Diag &D, const ClangdDiagnosticOptions &Opts) {
     OS << "\n\n";
     printDiag(OS, Note);
   }
-  OS.flush();
   return capitalize(std::move(Result));
 }
 
@@ -335,7 +334,6 @@ std::string noteMessage(const Diag &Main, const DiagBase &Note,
     OS << "\n\n";
     printDiag(OS, Main);
   }
-  OS.flush();
   return capitalize(std::move(Result));
 }
 
@@ -579,17 +577,7 @@ std::vector<Diag> StoreDiags::take(const clang::tidy::ClangTidyContext *Tidy) {
   for (auto &Diag : Output) {
     if (const char *ClangDiag = getDiagnosticCode(Diag.ID)) {
       // Warnings controlled by -Wfoo are better recognized by that name.
-      const StringRef Warning = [&] {
-        if (OrigSrcMgr) {
-          return OrigSrcMgr->getDiagnostics()
-              .getDiagnosticIDs()
-              ->getWarningOptionForDiag(Diag.ID);
-        }
-        if (!DiagnosticIDs::IsCustomDiag(Diag.ID))
-          return DiagnosticIDs{}.getWarningOptionForDiag(Diag.ID);
-        return StringRef{};
-      }();
-
+      StringRef Warning = DiagnosticIDs::getWarningOptionForDiag(Diag.ID);
       if (!Warning.empty()) {
         Diag.Name = ("-W" + Warning).str();
       } else {
@@ -906,23 +894,20 @@ void StoreDiags::flushLastDiag() {
   Output.push_back(std::move(*LastDiag));
 }
 
-bool isDiagnosticSuppressed(const clang::Diagnostic &Diag,
-                            const llvm::StringSet<> &Suppress,
-                            const LangOptions &LangOpts) {
+bool isBuiltinDiagnosticSuppressed(unsigned ID,
+                                   const llvm::StringSet<> &Suppress,
+                                   const LangOptions &LangOpts) {
   // Don't complain about header-only stuff in mainfiles if it's a header.
   // FIXME: would be cleaner to suppress in clang, once we decide whether the
   // behavior should be to silently-ignore or respect the pragma.
-  if (Diag.getID() == diag::pp_pragma_sysheader_in_main_file &&
-      LangOpts.IsHeaderFile)
+  if (ID == diag::pp_pragma_sysheader_in_main_file && LangOpts.IsHeaderFile)
     return true;
 
-  if (const char *CodePtr = getDiagnosticCode(Diag.getID())) {
+  if (const char *CodePtr = getDiagnosticCode(ID)) {
     if (Suppress.contains(normalizeSuppressedCode(CodePtr)))
       return true;
   }
 
-  StringRef Warning =
-      Diag.getDiags()->getDiagnosticIDs()->getWarningOptionForDiag(
-          Diag.getID());
+  StringRef Warning = DiagnosticIDs::getWarningOptionForDiag(ID);
   if (!Warning.empty() && Suppress.contains(Warning))
     return true;
   return false;
diff --git a/clang-tools-extra/clangd/Diagnostics.h b/clang-tools-extra/clangd/Diagnostics.h
index c45d8dc3aa6ce..d4c0478c63a5c 100644
--- a/clang-tools-extra/clangd/Diagnostics.h
+++ b/clang-tools-extra/clangd/Diagnostics.h
@@ -181,11 +181,11 @@ class StoreDiags : public DiagnosticConsumer {
 };
 
 /// Determine whether a (non-clang-tidy) diagnostic is suppressed by config.
-bool isDiagnosticSuppressed(const clang::Diagnostic &Diag,
-                            const llvm::StringSet<> &Suppressed,
-                            const LangOptions &);
+bool isBuiltinDiagnosticSuppressed(unsigned ID,
+                                   const llvm::StringSet<> &Suppressed,
+                                   const LangOptions &);
 /// Take a user-specified diagnostic code, and convert it to a normalized form
-/// stored in the config and consumed by isDiagnosticsSuppressed.
+/// stored in the config and consumed by isBuiltinDiagnosticSuppressed.
 ///
 /// (This strips err_ and -W prefix so we can match with or without them.)
llvm::StringRef normalizeSuppressedCode(llvm::StringRef); diff --git a/clang-tools-extra/clangd/FindSymbols.cpp b/clang-tools-extra/clangd/FindSymbols.cpp index 55f16b7085a6f..cf2f8b62a2841 100644 --- a/clang-tools-extra/clangd/FindSymbols.cpp +++ b/clang-tools-extra/clangd/FindSymbols.cpp @@ -182,7 +182,6 @@ std::string getSymbolName(ASTContext &Ctx, const NamedDecl &ND) { OS << (Method->isInstanceMethod() ? '-' : '+'); Method->getSelector().print(OS); - OS.flush(); return Name; } return printName(Ctx, ND); diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index de103e011c708..298fa79e3fd0b 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -150,7 +150,6 @@ std::string printDefinition(const Decl *D, PrintingPolicy PP, std::string Definition; llvm::raw_string_ostream OS(Definition); D->print(OS, PP); - OS.flush(); return Definition; } @@ -179,7 +178,6 @@ HoverInfo::PrintedType printType(QualType QT, ASTContext &ASTCtx, OS << TT->getDecl()->getKindName() << " "; } QT.print(OS, PP); - OS.flush(); const Config &Cfg = Config::current(); if (!QT.isNull() && Cfg.Hover.ShowAKA) { @@ -229,7 +227,6 @@ HoverInfo::PrintedType printType(const TemplateTemplateParmDecl *TTP, // FIXME: TemplateTemplateParameter doesn't store the info on whether this // param was a "typename" or "class". OS << "> class"; - OS.flush(); return Result; } @@ -821,7 +818,6 @@ std::string typeAsDefinition(const HoverInfo::PrintedType &PType) { OS << PType.Type; if (PType.AKA) OS << " // aka: " << *PType.AKA; - OS.flush(); return Result; } diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 5cf1691ce3961..045d32afbc938 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -342,7 +342,7 @@ void applyWarningOptions(llvm::ArrayRef ExtraArgs, if (Enable) { if (Diags.getDiagnosticLevel(ID, SourceLocation()) < DiagnosticsEngine::Warning) { - auto Group = Diags.getDiagnosticIDs()->getGroupForDiag(ID); + auto Group = DiagnosticIDs::getGroupForDiag(ID); if (!Group || !EnabledGroups(*Group)) continue; Diags.setSeverity(ID, diag::Severity::Warning, SourceLocation()); @@ -585,8 +585,8 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, ASTDiags.setLevelAdjuster([&](DiagnosticsEngine::Level DiagLevel, const clang::Diagnostic &Info) { if (Cfg.Diagnostics.SuppressAll || - isDiagnosticSuppressed(Info, Cfg.Diagnostics.Suppress, - Clang->getLangOpts())) + isBuiltinDiagnosticSuppressed(Info.getID(), Cfg.Diagnostics.Suppress, + Clang->getLangOpts())) return DiagnosticsEngine::Ignored; auto It = OverriddenSeverity.find(Info.getID()); diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index 84e8fec342829..c14c4d1ba103f 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -621,8 +621,8 @@ buildPreamble(PathRef FileName, CompilerInvocation CI, PreambleDiagnostics.setLevelAdjuster([&](DiagnosticsEngine::Level DiagLevel, const clang::Diagnostic &Info) { if (Cfg.Diagnostics.SuppressAll || - isDiagnosticSuppressed(Info, Cfg.Diagnostics.Suppress, - CI.getLangOpts())) + isBuiltinDiagnosticSuppressed(Info.getID(), Cfg.Diagnostics.Suppress, + CI.getLangOpts())) return DiagnosticsEngine::Ignored; switch (Info.getID()) { case diag::warn_no_newline_eof: @@ -913,7 +913,6 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName, PP.PatchedMarks = std::move(ModifiedScan->Marks); 
   PP.PatchedMacros = std::move(ModifiedScan->Macros);
   dlog("Created preamble patch: {0}", Patch.str());
-  Patch.flush();
   return PP;
 }
 
diff --git a/clang-tools-extra/clangd/Quality.cpp b/clang-tools-extra/clangd/Quality.cpp
index 7371d95fbf275..c1ab63fb22f61 100644
--- a/clang-tools-extra/clangd/Quality.cpp
+++ b/clang-tools-extra/clangd/Quality.cpp
@@ -554,7 +554,6 @@ std::string sortText(float Score, llvm::StringRef Name) {
   llvm::write_hex(OS, encodeFloat(-Score), llvm::HexPrintStyle::Lower,
                   /*Width=*/2 * sizeof(Score));
   OS << Name;
-  OS.flush();
   return S;
 }
 
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 3af99b9db056d..780aaa471dc8b 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -814,8 +814,8 @@ llvm::SmallVector<llvm::StringRef> ancestorNamespaces(llvm::StringRef NS) {
 
 // Checks whether \p FileName is a valid spelling of main file.
 bool isMainFile(llvm::StringRef FileName, const SourceManager &SM) {
-  auto FE = SM.getFileManager().getFile(FileName);
-  return FE && *FE == SM.getFileEntryForID(SM.getMainFileID());
+  auto FE = SM.getFileManager().getOptionalFileRef(FileName);
+  return FE && FE == SM.getFileEntryRefForID(SM.getMainFileID());
 }
 
 } // namespace
diff --git a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
index d4b9b173d149d..c1c2e9fab9664 100644
--- a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
+++ b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
@@ -483,7 +483,6 @@ std::string convertGlobToRegex(llvm::StringRef Glob) {
     }
   }
   RegStream << '$';
-  RegStream.flush();
   return RegText;
 }
 
diff --git a/clang-tools-extra/clangd/index/StdLib.cpp b/clang-tools-extra/clangd/index/StdLib.cpp
index 921ab5d1c96d5..d34838a45048d 100644
--- a/clang-tools-extra/clangd/index/StdLib.cpp
+++ b/clang-tools-extra/clangd/index/StdLib.cpp
@@ -87,7 +87,6 @@ std::string buildUmbrella(llvm::StringLiteral Mandatory,
                  "#endif\n",
                  Header);
   }
-  OS.flush();
   return Result;
 }
 
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index a76894cf0855f..d1d744a21cfd5 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -41,6 +41,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -75,18 +76,62 @@ bool isPrivateProtoDecl(const NamedDecl &ND) {
   if (ND.getIdentifier() == nullptr)
     return false;
   auto Name = ND.getIdentifier()->getName();
-  if (!Name.contains('_'))
-    return false;
-  // Nested proto entities (e.g. Message::Nested) have top-level decls
-  // that shouldn't be used (Message_Nested). Ignore them completely.
-  // The nested entities are dangling type aliases, we may want to reconsider
-  // including them in the future.
-  // For enum constants, SOME_ENUM_CONSTANT is not private and should be
-  // indexed. Outer_INNER is private. This heuristic relies on naming style, it
-  // will include OUTER_INNER and exclude some_enum_constant.
-  // FIXME: the heuristic relies on naming style (i.e. no underscore in
-  // user-defined names) and can be improved.
-  return (ND.getKind() != Decl::EnumConstant) || llvm::any_of(Name, islower);
+  // There are some internal helpers like _internal_set_foo();
+  if (Name.contains("_internal_"))
+    return true;
+
+  // https://protobuf.dev/reference/cpp/cpp-generated/#nested-types
+  // Nested entities (messages/enums) have two names, one at the top-level
+  // scope, with a mangled name created by prepending all the outer types.
+  // These names are almost never preferred by the developers, so exclude them
+  // from the index. e.g.
+  //   message Foo {
+  //     message Bar {}
+  //     enum E { A }
+  //   }
+  //
+  // yields:
+  //   class Foo_Bar {};
+  //   enum Foo_E { Foo_E_A };
+  //   class Foo {
+  //     using Bar = Foo_Bar;
+  //     static constexpr Foo_E A = Foo_E_A;
+  //   };
+
+  // We get rid of Foo_Bar and Foo_E by discarding any top-level entries with
+  // `_` in the name. This relies on the original message/enum not having `_`
+  // in the name, hence it might go wrong in certain cases.
+  if (ND.getDeclContext()->isNamespace()) {
+    // Strip off some known public suffix helpers for enums; the rest of the
+    // helpers are generated inside record decls, so we don't care about them.
+    // https://protobuf.dev/reference/cpp/cpp-generated/#enum
+    Name.consume_back("_descriptor");
+    Name.consume_back("_IsValid");
+    Name.consume_back("_Name");
+    Name.consume_back("_Parse");
+    Name.consume_back("_MIN");
+    Name.consume_back("_MAX");
+    Name.consume_back("_ARRAYSIZE");
+    return Name.contains('_');
+  }
+
+  // EnumConstantDecls need some special attention: despite being nested in a
+  // TagDecl, they might still have mangled names. We filter those by checking
+  // whether the name has the parent's name as a prefix.
+  // This might go wrong if a nested entity has a name that starts with the
+  // parent's name, e.g.: enum Foo { Foo_X }.
+  if (llvm::isa<EnumConstantDecl>(&ND)) {
+    auto *DC = llvm::dyn_cast<EnumDecl>(ND.getDeclContext());
+    if (!DC || !DC->getIdentifier())
+      return false;
+    auto CtxName = DC->getIdentifier()->getName();
+    return !CtxName.empty() && Name.consume_front(CtxName) &&
+           Name.consume_front("_");
+  }
+
+  // Now we're only left with fields/methods without an `_internal_` in the
+  // name; they're intended for public use.
+  return false;
 }
 
 // We only collect #include paths for symbols that are suitable for global code
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 3a5449ac8c799..1b669c50fa31a 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -919,6 +919,9 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var
   Opts.CodeComplete.EnableFunctionArgSnippets = EnableFunctionArgSnippets;
   Opts.CodeComplete.RunParser = CodeCompletionParse;
   Opts.CodeComplete.RankingModel = RankingModel;
+  // FIXME: If we're using C++20 modules, force the lookup process to load
+  // external decls, since currently the index doesn't support C++20 modules.
+  Opts.CodeComplete.ForceLoadPreamble = ExperimentalModulesSupport;
 
   RealThreadsafeFS TFS;
   std::vector<std::unique_ptr<config::Provider>> ProviderStack;
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index cf9b42828568d..4ecfdf0184ab4 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -298,41 +298,20 @@ TEST_F(ConfigCompileTests, DiagnosticSuppression) {
                   "unreachable-code", "unused-variable",
                   "typecheck_bool_condition", "unexpected_friend",
                   "warn_alloca"));
-  clang::DiagnosticsEngine DiagEngine(new DiagnosticIDs, nullptr,
-                                      new clang::IgnoringDiagConsumer);
-
-  using Diag = clang::Diagnostic;
-  {
-    auto D = DiagEngine.Report(diag::warn_unreachable);
-    EXPECT_TRUE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
+  EXPECT_TRUE(isBuiltinDiagnosticSuppressed(
+      diag::warn_unreachable, Conf.Diagnostics.Suppress, LangOptions()));
   // Subcategory not respected/suppressed.
-  {
-    auto D = DiagEngine.Report(diag::warn_unreachable_break);
-    EXPECT_FALSE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
-  {
-    auto D = DiagEngine.Report(diag::warn_unused_variable);
-    EXPECT_TRUE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
-  {
-    auto D = DiagEngine.Report(diag::err_typecheck_bool_condition);
-    EXPECT_TRUE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
-  {
-    auto D = DiagEngine.Report(diag::err_unexpected_friend);
-    EXPECT_TRUE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
-  {
-    auto D = DiagEngine.Report(diag::warn_alloca);
-    EXPECT_TRUE(isDiagnosticSuppressed(
-        Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions()));
-  }
+  EXPECT_FALSE(isBuiltinDiagnosticSuppressed(
+      diag::warn_unreachable_break, Conf.Diagnostics.Suppress, LangOptions()));
+  EXPECT_TRUE(isBuiltinDiagnosticSuppressed(
+      diag::warn_unused_variable, Conf.Diagnostics.Suppress, LangOptions()));
+  EXPECT_TRUE(isBuiltinDiagnosticSuppressed(diag::err_typecheck_bool_condition,
+                                            Conf.Diagnostics.Suppress,
+                                            LangOptions()));
+  EXPECT_TRUE(isBuiltinDiagnosticSuppressed(
+      diag::err_unexpected_friend, Conf.Diagnostics.Suppress, LangOptions()));
+  EXPECT_TRUE(isBuiltinDiagnosticSuppressed(
+      diag::warn_alloca, Conf.Diagnostics.Suppress, LangOptions()));
 
   Frag.Diagnostics.Suppress.emplace_back("*");
   EXPECT_TRUE(compileAndApply());
diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
index efb2e5ed2fbe1..7a47d6ebebf3b 100644
--- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
@@ -1984,6 +1984,30 @@ TEST(Diagnostics, Tags) {
                    withTag(DiagnosticTag::Deprecated)))));
 }
 
+TEST(Diagnostics, TidyDiagsArentAffectedFromWerror) {
+  TestTU TU;
+  TU.ExtraArgs = {"-Werror"};
+  Annotations Test(R"cpp($typedef[[typedef int INT]]; // error-ok)cpp");
+  TU.Code = Test.code().str();
+  TU.ClangTidyProvider = addTidyChecks("modernize-use-using");
+  EXPECT_THAT(
+      TU.build().getDiagnostics(),
+      ifTidyChecks(UnorderedElementsAre(
+          AllOf(Diag(Test.range("typedef"), "use 'using' instead of 'typedef'"),
+                // Make sure the severity of a clang-tidy finding isn't bumped
+                // to error due to -Werror in compile flags.
+                diagSeverity(DiagnosticsEngine::Warning)))));
+
+  TU.ClangTidyProvider =
+      addTidyChecks("modernize-use-using", /*WarningsAsErrors=*/"modernize-*");
+  EXPECT_THAT(
+      TU.build().getDiagnostics(),
+      ifTidyChecks(UnorderedElementsAre(
+          AllOf(Diag(Test.range("typedef"), "use 'using' instead of 'typedef'"),
+                // Unless bumped explicitly with WarnAsError.
+                diagSeverity(DiagnosticsEngine::Error)))));
+}
+
 TEST(Diagnostics, DeprecatedDiagsAreHints) {
   ClangdDiagnosticOptions Opts;
   std::optional<clangd::Diag> Diag;
diff --git a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
index 4bb76cd6ab830..6ee641caeefe3 100644
--- a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
@@ -397,10 +397,10 @@ TEST(ParsedASTTest, PatchesAdditionalIncludes) {
   auto &FM = SM.getFileManager();
   // Copy so that we can use operator[] to get the children.
   IncludeStructure Includes = PatchedAST->getIncludeStructure();
-  auto MainFE = FM.getFile(testPath("foo.cpp"));
+  auto MainFE = FM.getOptionalFileRef(testPath("foo.cpp"));
   ASSERT_TRUE(MainFE);
   auto MainID = Includes.getID(*MainFE);
-  auto AuxFE = FM.getFile(testPath("sub/aux.h"));
+  auto AuxFE = FM.getOptionalFileRef(testPath("sub/aux.h"));
   ASSERT_TRUE(AuxFE);
   auto AuxID = Includes.getID(*AuxFE);
   EXPECT_THAT(Includes.IncludeChildren[*MainID], Contains(*AuxID));
diff --git a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
index 7bbb95c8b8d67..691a93e7acd0a 100644
--- a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
+++ b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
@@ -402,6 +402,86 @@ import A;
   EXPECT_TRUE(D.isFromASTFile());
 }
 
+// An end-to-end test for code completion in modules.
+TEST_F(PrerequisiteModulesTests, CodeCompleteTest) {
+  MockDirectoryCompilationDatabase CDB(TestDir, FS);
+
+  CDB.addFile("A.cppm", R"cpp(
+export module A;
+export void printA();
+  )cpp");
+
+  llvm::StringLiteral UserContents = R"cpp(
+import A;
+void func() {
+  print^
+}
+)cpp";
+
+  CDB.addFile("Use.cpp", UserContents);
+  Annotations Test(UserContents);
+
+  ModulesBuilder Builder(CDB);
+
+  ParseInputs Use = getInputs("Use.cpp", CDB);
+  Use.ModulesManager = &Builder;
+
+  std::unique_ptr<CompilerInvocation> CI =
+      buildCompilerInvocation(Use, DiagConsumer);
+  EXPECT_TRUE(CI);
+
+  auto Preamble =
+      buildPreamble(getFullPath("Use.cpp"), *CI, Use, /*InMemory=*/true,
+                    /*Callback=*/nullptr);
+  EXPECT_TRUE(Preamble);
+  EXPECT_TRUE(Preamble->RequiredModules);
+
+  auto Result = codeComplete(getFullPath("Use.cpp"), Test.point(),
+                             Preamble.get(), Use, {});
+  EXPECT_FALSE(Result.Completions.empty());
+  EXPECT_EQ(Result.Completions[0].Name, "printA");
+}
+
+TEST_F(PrerequisiteModulesTests, SignatureHelpTest) {
+  MockDirectoryCompilationDatabase CDB(TestDir, FS);
+
+  CDB.addFile("A.cppm", R"cpp(
+export module A;
+export void printA(int a);
+  )cpp");
+
+  llvm::StringLiteral UserContents = R"cpp(
+import A;
+void func() {
+  printA(^);
+}
+)cpp";
+
+  CDB.addFile("Use.cpp", UserContents);
+  Annotations Test(UserContents);
+
+  ModulesBuilder Builder(CDB);
+
+  ParseInputs Use = getInputs("Use.cpp", CDB);
+  Use.ModulesManager = &Builder;
+
+  std::unique_ptr<CompilerInvocation> CI =
+      buildCompilerInvocation(Use, DiagConsumer);
+  EXPECT_TRUE(CI);
+
+  auto Preamble =
+      buildPreamble(getFullPath("Use.cpp"), *CI, Use, /*InMemory=*/true,
+                    /*Callback=*/nullptr);
+  EXPECT_TRUE(Preamble);
+  EXPECT_TRUE(Preamble->RequiredModules);
+
+  auto Result = signatureHelp(getFullPath("Use.cpp"), Test.point(),
+                              *Preamble.get(), Use, MarkupKind::PlainText);
+  EXPECT_FALSE(Result.signatures.empty());
+  EXPECT_EQ(Result.signatures[0].label, "printA(int a) -> void");
+  EXPECT_EQ(Result.signatures[0].parameters[0].labelString, "int a");
+}
+
 } // namespace
 } // namespace clang::clangd
diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
index 0666be95b6b9e..e8088cb37fa51 100644
--- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -201,19 +201,63 @@ TEST_F(ShouldCollectSymbolTest, NoPrivateProtoSymbol) {
   build(
       R"(// Generated by the protocol buffer compiler. DO NOT EDIT!
       namespace nx {
-        class Top_Level {};
-        class TopLevel {};
-        enum Kind {
-          KIND_OK,
-          Kind_Not_Ok,
+        enum Outer_Enum : int {
+          Outer_Enum_KIND1,
+          Outer_Enum_Kind_2,
         };
+        bool Outer_Enum_IsValid(int);
+
+        class Outer_Inner {};
+        class Outer {
+          using Inner = Outer_Inner;
+          using Enum = Outer_Enum;
+          static constexpr Enum KIND1 = Outer_Enum_KIND1;
+          static constexpr Enum Kind_2 = Outer_Enum_Kind_2;
+          static bool Enum_IsValid(int);
+          int &x();
+          void set_x();
+          void _internal_set_x();
+
+          int &Outer_y();
+        };
+        enum Foo {
+          FOO_VAL1,
+          Foo_VAL2,
+        };
+        bool Foo_IsValid(int);
       })");
-  EXPECT_TRUE(shouldCollect("nx::TopLevel"));
-  EXPECT_TRUE(shouldCollect("nx::Kind::KIND_OK"));
-  EXPECT_TRUE(shouldCollect("nx::Kind"));
-  EXPECT_FALSE(shouldCollect("nx::Top_Level"));
-  EXPECT_FALSE(shouldCollect("nx::Kind::Kind_Not_Ok"));
+  // Make sure all the mangled names for Outer::Enum are discarded.
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_KIND1"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_Kind_2"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_IsValid"));
+  // But nested aliases are preserved.
+  EXPECT_TRUE(shouldCollect("nx::Outer::Enum"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::KIND1"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Kind_2"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Enum_IsValid"));
+
+  // Check for Outer::Inner.
+  EXPECT_FALSE(shouldCollect("nx::Outer_Inner"));
+  EXPECT_TRUE(shouldCollect("nx::Outer"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Inner"));
+
+  // Make sure field-related information is preserved, unless it's explicitly
+  // marked with `_internal_`.
+  EXPECT_TRUE(shouldCollect("nx::Outer::x"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::set_x"));
+  EXPECT_FALSE(shouldCollect("nx::Outer::_internal_set_x"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Outer_y"));
+
+  // Handling of a top-level enum.
+  EXPECT_TRUE(shouldCollect("nx::Foo::FOO_VAL1"));
+  EXPECT_TRUE(shouldCollect("nx::FOO_VAL1"));
+  EXPECT_TRUE(shouldCollect("nx::Foo_IsValid"));
+  // Our heuristic goes wrong here if the user has a nested name that starts
+  // with the parent's name.
+ EXPECT_FALSE(shouldCollect("nx::Foo::Foo_VAL2")); + EXPECT_FALSE(shouldCollect("nx::Foo_VAL2")); } TEST_F(ShouldCollectSymbolTest, DoubleCheckProtoHeaderComment) { diff --git a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp index 2f82ec7444d7a..15158d8a45ca8 100644 --- a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp +++ b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp @@ -242,7 +242,6 @@ const NamedDecl &findDeclWithTemplateArgs(ParsedAST &AST, // Use getNameForDiagnostic() which includes the template // arguments in the printed name. ND.getNameForDiagnostic(OS, Policy, /*Qualified=*/true); - OS.flush(); return QName == Query; }); } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 8f7b0b5333f3a..e34e296b5a096 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -103,6 +103,12 @@ Improvements to clang-tidy New checks ^^^^^^^^^^ +- New :doc:`bugprone-tagged-union-member-count + ` check. + + Gives warnings for tagged unions, where the number of tags is + different from the number of data members inside the union. + New check aliases ^^^^^^^^^^^^^^^^^ @@ -125,11 +131,24 @@ Changes in existing checks ` check by fixing a crash when determining if an ``enable_if[_t]`` was found. +- Improved :doc:`bugprone-posix-return + ` check to support integer literals + as LHS and posix call as RHS of comparison. + - Improved :doc:`bugprone-sizeof-expression ` check to find suspicious usages of ``sizeof()``, ``alignof()``, and ``offsetof()`` when adding or subtracting from a pointer. +- Improved :doc:`bugprone-unchecked-optional-access + ` to support + `bsl::optional` and `bdlb::NullableValue` from + _. + +- Improved :doc:`bugprone-unsafe-functions + ` check to allow specifying + additional functions to match. + - Improved :doc:`cert-flp30-c ` check to fix false positive that floating point variable is only used in increment expression. @@ -161,6 +180,10 @@ Changes in existing checks a false positive when only an implicit conversion happened inside an initializer list. +- Improved :doc:`modernize-use-nullptr + ` check to also recognize + ``NULL``/``__null`` (but not ``0``) when used with a templated type. + - Improved :doc:`modernize-use-std-print ` check to support replacing member function calls too. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst new file mode 100644 index 0000000000000..2f1036c10345e --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst @@ -0,0 +1,280 @@ +.. title:: clang-tidy - bugprone-tagged-union-member-count + +bugprone-tagged-union-member-count +================================== + +Gives warnings for tagged unions, where the number of tags is +different from the number of data members inside the union. + +A struct or a class is considered to be a tagged union if it has +exactly one union data member and exactly one enum data member and +any number of other data members that are neither unions or enums. + +Example: + +.. 
+.. code-block:: c++
+
+  enum Tags {
+    Tag1,
+    Tag2,
+  };
+
+  struct TaggedUnion { // warning: tagged union has more data members (3) than tags (2)
+    enum Tags Kind;
+    union {
+      int I;
+      float F;
+      char *Str;
+    } Data;
+  };
+
+How enum constants are counted
+------------------------------
+
+The main complicating factor when counting the number of enum constants is that
+some of them might be auxiliary values that purposefully don't have a corresponding
+union data member and are used for something else. For example, the last enum
+constant sometimes explicitly "points to" the last declared valid enum constant or
+tracks how many enum constants have been declared.
+
+For illustration:
+
+.. code-block:: c++
+
+  enum TagWithLast {
+    Tag1 = 0,
+    Tag2 = 1,
+    Tag3 = 2,
+    LastTag = 2
+  };
+
+  enum TagWithCounter {
+    Tag1, // is 0
+    Tag2, // is 1
+    Tag3, // is 2
+    TagCount, // is 3
+  };
+
+The check counts the number of distinct values among the enum constants, not the
+enum constants themselves. This way, enum constants that are essentially just
+aliases of other enum constants are not included in the final count.
+
+Handling of counting enum constants (ones like :code:`TagCount` in the previous code example)
+is done by decreasing the number of enum values by one if the name of the last enum constant
+starts with a prefix or ends with a suffix specified in :option:`CountingEnumPrefixes` or
+:option:`CountingEnumSuffixes`, and its value is one less than the total number of distinct
+values in the enum.
+
+When the final count is adjusted based on this heuristic, a diagnostic note is emitted
+that shows which enum constant matched the criteria.
+
+The heuristic can be disabled entirely (:option:`EnableCountingEnumHeuristic`) or
+configured to follow your naming convention (:option:`CountingEnumPrefixes`, :option:`CountingEnumSuffixes`).
+The strings specified in :option:`CountingEnumPrefixes` and :option:`CountingEnumSuffixes`
+are matched case-insensitively.
+
+Example counts:
+
+.. code-block:: c++
+
+  // Enum count is 3, because the value 2 is counted only once
+  enum TagWithLast {
+    Tag1 = 0,
+    Tag2 = 1,
+    Tag3 = 2,
+    LastTag = 2
+  };
+
+  // Enum count is 3, because TagCount is heuristically excluded
+  enum TagWithCounter {
+    Tag1, // is 0
+    Tag2, // is 1
+    Tag3, // is 2
+    TagCount, // is 3
+  };
+
+
+Options
+-------
+
+.. option:: EnableCountingEnumHeuristic
+
+This option enables or disables the counting enum heuristic. It uses the
+prefixes and suffixes specified in the options :option:`CountingEnumPrefixes`
+and :option:`CountingEnumSuffixes` to find counting enum constants via prefix
+and suffix matching.
+
+This option is enabled by default.
+
+When :option:`EnableCountingEnumHeuristic` is `false`:
+
+.. code-block:: c++
+
+  enum TagWithCounter {
+    Tag1,
+    Tag2,
+    Tag3,
+    TagCount,
+  };
+
+  struct TaggedUnion {
+    TagWithCounter Kind;
+    union {
+      int A;
+      long B;
+      char *Str;
+      float F;
+    } Data;
+  };
+
+When :option:`EnableCountingEnumHeuristic` is `true`:
+
+.. code-block:: c++
+
+  enum TagWithCounter {
+    Tag1,
+    Tag2,
+    Tag3,
+    TagCount,
+  };
+
+  struct TaggedUnion { // warning: tagged union has more data members (4) than tags (3)
+    TagWithCounter Kind;
+    union {
+      int A;
+      long B;
+      char *Str;
+      float F;
+    } Data;
+  };
+
+.. option:: CountingEnumPrefixes
+
+See :option:`CountingEnumSuffixes` below.
+
+.. option:: CountingEnumSuffixes
+
+CountingEnumPrefixes and CountingEnumSuffixes are lists of semicolon-separated
+strings that are used to search for possible counting enum constants. These
+strings are matched case-insensitively as prefixes and suffixes, respectively,
+on the names of the enum constants.
+If :option:`EnableCountingEnumHeuristic` is `false`, these options do nothing.
+
+The default value of :option:`CountingEnumSuffixes` is `count` and of
+:option:`CountingEnumPrefixes` is the empty string.
+
+When :option:`EnableCountingEnumHeuristic` is `true` and :option:`CountingEnumSuffixes`
+is `count;size`:
+
+.. code-block:: c++
+
+  enum TagWithCounterCount {
+    Tag1,
+    Tag2,
+    Tag3,
+    TagCount,
+  };
+
+  struct TaggedUnionCount { // warning: tagged union has more data members (4) than tags (3)
+    TagWithCounterCount Kind;
+    union {
+      int A;
+      long B;
+      char *Str;
+      float F;
+    } Data;
+  };
+
+  enum TagWithCounterSize {
+    Tag11,
+    Tag22,
+    Tag33,
+    TagSize,
+  };
+
+  struct TaggedUnionSize { // warning: tagged union has more data members (4) than tags (3)
+    TagWithCounterSize Kind;
+    union {
+      int A;
+      long B;
+      char *Str;
+      float F;
+    } Data;
+  };
+
+When :option:`EnableCountingEnumHeuristic` is `true` and :option:`CountingEnumPrefixes`
+is `maxsize;last_`:
+
+.. code-block:: c++
+
+  enum TagWithCounterLast {
+    Tag1,
+    Tag2,
+    Tag3,
+    last_tag,
+  };
+
+  struct TaggedUnionLast { // warning: tagged union has more data members (4) than tags (3)
+    TagWithCounterLast tag;
+    union {
+      int I;
+      short S;
+      char *C;
+      float F;
+    } Data;
+  };
+
+  enum TagWithCounterMaxSize {
+    Tag1,
+    Tag2,
+    Tag3,
+    MaxSizeTag,
+  };
+
+  struct TaggedUnionMaxSize { // warning: tagged union has more data members (4) than tags (3)
+    TagWithCounterMaxSize tag;
+    union {
+      int I;
+      short S;
+      char *C;
+      float F;
+    } Data;
+  };
+
+.. option:: StrictMode
+
+When enabled, the check will also give a warning when the number of tags
+is greater than the number of union data members.
+
+This option is disabled by default.
+
+When :option:`StrictMode` is `false`:
+
+.. code-block:: c++
+
+  struct TaggedUnion {
+    enum {
+      Tag1,
+      Tag2,
+      Tag3,
+    } Tags;
+    union {
+      int I;
+      float F;
+    } Data;
+  };
+
+When :option:`StrictMode` is `true`:
+
+.. code-block:: c++
+
+  struct TaggedUnion { // warning: tagged union has fewer data members (2) than tags (3)
+    enum {
+      Tag1,
+      Tag2,
+      Tag3,
+    } Tags;
+    union {
+      int I;
+      float F;
+    } Data;
+  };
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
index 5a6aaa077d9bf..97fe37b535356 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
@@ -8,9 +8,10 @@ results. Therefore, it may be more resource intensive (RAM, CPU) than the
 average clang-tidy check.
 
 This check identifies unsafe accesses to values contained in
-``std::optional``, ``absl::optional``, ``base::Optional``, or
-``folly::Optional`` objects. Below we will refer to all these types
-collectively as ``optional``.
+``std::optional``, ``absl::optional``, ``base::Optional``,
+``folly::Optional``, ``bsl::optional``, or
+``BloombergLP::bdlb::NullableValue`` objects. Below we will refer to all these
+types collectively as ``optional``.
 
 An access to the value of an ``optional`` occurs when one of its ``value``,
 ``operator*``, or ``operator->`` member functions is invoked.
 To align with
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
index a0a267883b6fe..fb070627e31b1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
@@ -19,6 +19,8 @@ The check implements the following rules from the CERT C Coding Standard:
 Unsafe functions
 ----------------
 
+The following functions are reported if :option:`ReportDefaultFunctions` is enabled.
+
 If *Annex K.* is available, a replacement from *Annex K.* is suggested for the
 following functions:
 
@@ -45,8 +47,7 @@ The following functions are always checked, regardless of *Annex K* availability
 - ``rewind``, suggested replacement: ``fseek``
 - ``setbuf``, suggested replacement: ``setvbuf``
 
-If `ReportMoreUnsafeFunctions
-`_ is enabled,
+If :option:`ReportMoreUnsafeFunctions` is enabled,
 the following functions are also checked:
 
 - ``bcmp``, suggested replacement: ``memcmp``
@@ -74,6 +75,44 @@ Both macros have to be defined to suggest replacement functions from *Annex K.*
 ``__STDC_WANT_LIB_EXT1__`` must be defined to ``1`` by the user **before**
 including any system headers.
 
+.. _CustomFunctions:
+
+Custom functions
+----------------
+
+The option :option:`CustomFunctions` allows the user to define custom functions to be
+checked. The format is the following, without newlines:
+
+.. code::
+
+  bugprone-unsafe-functions.CustomFunctions="
+    functionRegex1[, replacement1[, reason1]];
+    functionRegex2[, replacement2[, reason2]];
+    ...
+  "
+
+The functions are matched using POSIX extended regular expressions.
+*(Note: The regular expressions do not support negative* ``(?!)`` *matches.)*
+
+The `reason` is optional and is used to provide additional information about the
+reasoning behind the replacement. The default reason is `is marked as unsafe`.
+
+If `replacement` is empty, the text `it should not be used` will be shown
+instead of the suggestion for a replacement.
+
+As an example, the configuration `^original$, replacement, is deprecated;`
+will produce the following diagnostic message:
+
+.. code:: c
+
+  original();          // warning: function 'original' is deprecated; 'replacement' should be used instead.
+  ::std::original();   // no-warning
+  original_function(); // no-warning
+
+If the regular expression contains the character `:`, it is matched against the
+qualified name (e.g. ``std::original``); otherwise, the regex is matched against
+the unqualified name (``original``).
+If the regular expression starts with `::` (or `^::`), it is matched against the
+fully qualified name (``::std::original``).
 
 Options
 -------
@@ -86,6 +125,19 @@ Options
   this option enables. Default is `true`.
 
+.. option:: ReportDefaultFunctions
+
+  When `true`, the check reports the default set of functions.
+  Consider changing the setting to false if you only want to see custom
+  functions matched via :ref:`custom functions<CustomFunctions>`.
+  Default is `true`.
+
+.. option:: CustomFunctions
+
+  A semicolon-separated list of custom functions to be matched. A matched
+  function contains a regular expression, an optional name of the replacement
+  function, and an optional reason, separated by commas. For more information,
+  see :ref:`Custom functions<CustomFunctions>`.
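+
+  As a concrete illustration, a ``.clang-tidy`` file along these lines (a
+  minimal sketch; ``my_alloc`` and ``safe_alloc`` are made-up names, not part
+  of any real API) reports only the custom matches:
+
+  .. code:: yaml
+
+    Checks: 'bugprone-unsafe-functions'
+    CheckOptions:
+      bugprone-unsafe-functions.ReportDefaultFunctions: false
+      bugprone-unsafe-functions.CustomFunctions: '^my_alloc$, safe_alloc, is banned in this codebase'
+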
 Examples
 --------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 1909d7b8d8e24..e3dfabba8fad1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -145,6 +145,7 @@ Clang-Tidy Checks
    :doc:`bugprone-suspicious-stringview-data-usage <bugprone/suspicious-stringview-data-usage>`,
    :doc:`bugprone-swapped-arguments <bugprone/swapped-arguments>`, "Yes"
    :doc:`bugprone-switch-missing-default-case <bugprone/switch-missing-default-case>`,
+   :doc:`bugprone-tagged-union-member-count <bugprone/tagged-union-member-count>`,
    :doc:`bugprone-terminating-continue <bugprone/terminating-continue>`, "Yes"
    :doc:`bugprone-throw-keyword-missing <bugprone/throw-keyword-missing>`,
    :doc:`bugprone-too-small-loop-variable <bugprone/too-small-loop-variable>`,
diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h
index cd2111cf72abf..46ca3c9d08074 100644
--- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h
+++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include <string>
+#include <utility>
 
 namespace clang {
 class SourceLocation;
@@ -62,7 +63,8 @@ void walkUsed(llvm::ArrayRef<Decl *> ASTRoots,
 
 struct AnalysisResults {
   std::vector<const Include *> Unused;
-  std::vector<std::string> Missing; // Spellings, like "<vector>"
+  // Spellings, like "<vector>", paired with the Header that generated it.
+  std::vector<std::pair<std::string, Header>> Missing;
 };
 
 /// Determine which headers should be inserted or removed from the main file.
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index 05e9d14734a95..16013f53894e8 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -26,8 +26,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include
@@ -84,7 +84,7 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
   auto &SM = PP.getSourceManager();
   const auto MainFile = *SM.getFileEntryRefForID(SM.getMainFileID());
   llvm::DenseSet<const Include *> Used;
-  llvm::StringSet<> Missing;
+  llvm::StringMap<Header>
Missing; if (!HeaderFilter) HeaderFilter = [](llvm::StringRef) { return false; }; OptionalDirectoryEntryRef ResourceDir = @@ -119,7 +119,7 @@ analyze(llvm::ArrayRef ASTRoots, Satisfied = true; } if (!Satisfied) - Missing.insert(std::move(Spelling)); + Missing.try_emplace(std::move(Spelling), Providers.front()); }); AnalysisResults Results; @@ -144,8 +144,8 @@ analyze(llvm::ArrayRef ASTRoots, } Results.Unused.push_back(&I); } - for (llvm::StringRef S : Missing.keys()) - Results.Missing.push_back(S.str()); + for (auto &E : Missing) + Results.Missing.emplace_back(E.first().str(), E.second); llvm::sort(Results.Missing); return Results; } @@ -158,9 +158,9 @@ std::string fixIncludes(const AnalysisResults &Results, // Encode insertions/deletions in the magic way clang-format understands. for (const Include *I : Results.Unused) cantFail(R.add(tooling::Replacement(FileName, UINT_MAX, 1, I->quote()))); - for (llvm::StringRef Spelled : Results.Missing) - cantFail(R.add(tooling::Replacement(FileName, UINT_MAX, 0, - ("#include " + Spelled).str()))); + for (auto &[Spelled, _] : Results.Missing) + cantFail(R.add( + tooling::Replacement(FileName, UINT_MAX, 0, "#include " + Spelled))); // "cleanup" actually turns the UINT_MAX replacements into concrete edits. auto Positioned = cantFail(format::cleanupAroundReplacements(Code, R, Style)); return cantFail(tooling::applyAllReplacements(Code, Positioned)); diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index afae4365587ae..080099adc9a07 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -192,7 +192,7 @@ class Action : public clang::ASTFrontendAction { case PrintStyle::Changes: for (const Include *I : Results.Unused) llvm::outs() << "- " << I->quote() << " @Line:" << I->Line << "\n"; - for (const auto &I : Results.Missing) + for (const auto &[I, _] : Results.Missing) llvm::outs() << "+ " << I << "\n"; break; case PrintStyle::Final: diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp index 43634ee8f2d80..d2d137a0dfb42 100644 --- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/VirtualFileSystem.h" @@ -39,6 +40,7 @@ namespace clang::include_cleaner { namespace { +using testing::_; using testing::AllOf; using testing::Contains; using testing::ElementsAre; @@ -262,10 +264,12 @@ int x = a + c; auto Results = analyze(std::vector{Decls.begin(), Decls.end()}, PP.MacroReferences, PP.Includes, &PI, AST.preprocessor()); + auto CHeader = llvm::cantFail( + AST.context().getSourceManager().getFileManager().getFileRef("c.h")); const Include *B = PP.Includes.atLine(3); ASSERT_EQ(B->Spelled, "b.h"); - EXPECT_THAT(Results.Missing, ElementsAre("\"c.h\"")); + EXPECT_THAT(Results.Missing, ElementsAre(Pair("\"c.h\"", Header(CHeader)))); EXPECT_THAT(Results.Unused, ElementsAre(B)); } @@ -370,7 +374,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor()); // Check that we're spelling header using the symlink, and not 
underlying // path. - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); // header.h should be unused. EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); @@ -379,7 +383,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { auto HeaderFilter = [](llvm::StringRef Path) { return Path == "inner.h"; }; Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor(), HeaderFilter); - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); // header.h should be unused. EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); } @@ -389,7 +393,7 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { HeaderFilter); // header.h should be ignored now. EXPECT_THAT(Results.Unused, Not(testing::IsEmpty())); - EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\"")); + EXPECT_THAT(Results.Missing, testing::ElementsAre(Pair("\"inner.h\"", _))); } } @@ -414,9 +418,9 @@ TEST(FixIncludes, Basic) { Inc.add(I); AnalysisResults Results; - Results.Missing.push_back("\"aa.h\""); - Results.Missing.push_back("\"ab.h\""); - Results.Missing.push_back(""); + Results.Missing.emplace_back("\"aa.h\"", Header("")); + Results.Missing.emplace_back("\"ab.h\"", Header("")); + Results.Missing.emplace_back("", Header("")); Results.Unused.push_back(Inc.atLine(3)); Results.Unused.push_back(Inc.atLine(4)); @@ -429,7 +433,7 @@ R"cpp(#include "d.h" )cpp"); Results = {}; - Results.Missing.push_back("\"d.h\""); + Results.Missing.emplace_back("\"d.h\"", Header("")); Code = R"cpp(#include "a.h")cpp"; EXPECT_EQ(fixIncludes(Results, "d.cc", Code, format::getLLVMStyle()), R"cpp(#include "d.h" diff --git a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp index c5fc465ced7a7..84e02e1d0d621 100644 --- a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp @@ -60,7 +60,7 @@ class FindHeadersTest : public testing::Test { llvm::SmallVector> findHeaders(llvm::StringRef FileName) { return include_cleaner::findHeaders( AST->sourceManager().translateFileLineCol( - AST->fileManager().getFile(FileName).get(), + *AST->fileManager().getOptionalFileRef(FileName), /*Line=*/1, /*Col=*/1), AST->sourceManager(), &PI); } diff --git a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp index 715d95eb57346..b5a7b9720903e 100644 --- a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp @@ -234,7 +234,7 @@ TEST_F(RecordPPTest, CapturesMacroRefs) { const auto &SM = AST.sourceManager(); SourceLocation Def = SM.getComposedLoc( - SM.translateFile(AST.fileManager().getFile("header.h").get()), + SM.translateFile(*AST.fileManager().getOptionalFileRef("header.h")), Header.point("def")); ASSERT_THAT(Recorded.MacroReferences, Not(IsEmpty())); Symbol OrigX = Recorded.MacroReferences.front().Target; @@ -368,29 +368,29 @@ TEST_F(PragmaIncludeTest, IWYUKeep) { TestAST Processed = build(); auto &FM = Processed.fileManager(); - EXPECT_FALSE(PI.shouldKeep(FM.getFile("normal.h").get())); - EXPECT_FALSE(PI.shouldKeep(FM.getFile("std/vector").get())); + EXPECT_FALSE(PI.shouldKeep(*FM.getOptionalFileRef("normal.h"))); + EXPECT_FALSE(PI.shouldKeep(*FM.getOptionalFileRef("std/vector"))); // 
Keep - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep1.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep2.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep3.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep4.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep5.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("keep6.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("std/map").get())); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep1.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep2.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep3.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep4.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep5.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("keep6.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("std/map"))); // Exports - EXPECT_TRUE(PI.shouldKeep(FM.getFile("export1.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("export2.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("export3.h").get())); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("std/set").get())); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("export1.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("export2.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("export3.h"))); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("std/set"))); } TEST_F(PragmaIncludeTest, AssociatedHeader) { createEmptyFiles({"foo/main.h", "bar/main.h", "bar/other.h", "std/vector"}); auto IsKeep = [&](llvm::StringRef Name, TestAST &AST) { - return PI.shouldKeep(AST.fileManager().getFile(Name).get()); + return PI.shouldKeep(*AST.fileManager().getOptionalFileRef(Name)); }; Inputs.FileName = "main.cc"; @@ -452,19 +452,19 @@ TEST_F(PragmaIncludeTest, IWYUPrivate) { // IWYU pragma: private )cpp"; TestAST Processed = build(); - auto PrivateFE = Processed.fileManager().getFile("private.h"); + auto PrivateFE = Processed.fileManager().getOptionalFileRef("private.h"); assert(PrivateFE); - EXPECT_TRUE(PI.isPrivate(PrivateFE.get())); - EXPECT_EQ(PI.getPublic(PrivateFE.get()), "\"public2.h\""); + EXPECT_TRUE(PI.isPrivate(*PrivateFE)); + EXPECT_EQ(PI.getPublic(*PrivateFE), "\"public2.h\""); - auto PublicFE = Processed.fileManager().getFile("public.h"); + auto PublicFE = Processed.fileManager().getOptionalFileRef("public.h"); assert(PublicFE); - EXPECT_EQ(PI.getPublic(PublicFE.get()), ""); // no mapping. - EXPECT_FALSE(PI.isPrivate(PublicFE.get())); + EXPECT_EQ(PI.getPublic(*PublicFE), ""); // no mapping. 
+ EXPECT_FALSE(PI.isPrivate(*PublicFE)); - auto Private2FE = Processed.fileManager().getFile("private2.h"); + auto Private2FE = Processed.fileManager().getOptionalFileRef("private2.h"); assert(Private2FE); - EXPECT_TRUE(PI.isPrivate(Private2FE.get())); + EXPECT_TRUE(PI.isPrivate(*Private2FE)); } TEST_F(PragmaIncludeTest, IWYUExport) { @@ -486,13 +486,13 @@ TEST_F(PragmaIncludeTest, IWYUExport) { const auto &SM = Processed.sourceManager(); auto &FM = Processed.fileManager(); - EXPECT_THAT(PI.getExporters(FM.getFile("private.h").get(), FM), + EXPECT_THAT(PI.getExporters(*FM.getOptionalFileRef("private.h"), FM), testing::UnorderedElementsAre(FileNamed("export1.h"), FileNamed("export3.h"))); - EXPECT_TRUE(PI.getExporters(FM.getFile("export1.h").get(), FM).empty()); - EXPECT_TRUE(PI.getExporters(FM.getFile("export2.h").get(), FM).empty()); - EXPECT_TRUE(PI.getExporters(FM.getFile("export3.h").get(), FM).empty()); + EXPECT_TRUE(PI.getExporters(*FM.getOptionalFileRef("export1.h"), FM).empty()); + EXPECT_TRUE(PI.getExporters(*FM.getOptionalFileRef("export2.h"), FM).empty()); + EXPECT_TRUE(PI.getExporters(*FM.getOptionalFileRef("export3.h"), FM).empty()); EXPECT_TRUE( PI.getExporters(SM.getFileEntryForID(SM.getMainFileID()), FM).empty()); } @@ -546,26 +546,25 @@ TEST_F(PragmaIncludeTest, IWYUExportBlock) { for (auto &FE : FEs) { OS << FE.getName() << " "; } - OS.flush(); return Result; }; - auto Exporters = PI.getExporters(FM.getFile("private1.h").get(), FM); + auto Exporters = PI.getExporters(*FM.getOptionalFileRef("private1.h"), FM); EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h"), FileNamed("normal.h"))) << GetNames(Exporters); - Exporters = PI.getExporters(FM.getFile("private2.h").get(), FM); + Exporters = PI.getExporters(*FM.getOptionalFileRef("private2.h"), FM); EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h"))) << GetNames(Exporters); - Exporters = PI.getExporters(FM.getFile("private3.h").get(), FM); + Exporters = PI.getExporters(*FM.getOptionalFileRef("private3.h"), FM); EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h"))) << GetNames(Exporters); - Exporters = PI.getExporters(FM.getFile("foo.h").get(), FM); + Exporters = PI.getExporters(*FM.getOptionalFileRef("foo.h"), FM); EXPECT_TRUE(Exporters.empty()) << GetNames(Exporters); - Exporters = PI.getExporters(FM.getFile("bar.h").get(), FM); + Exporters = PI.getExporters(*FM.getOptionalFileRef("bar.h"), FM); EXPECT_TRUE(Exporters.empty()) << GetNames(Exporters); } @@ -581,8 +580,8 @@ TEST_F(PragmaIncludeTest, SelfContained) { Inputs.ExtraFiles["unguarded.h"] = ""; TestAST Processed = build(); auto &FM = Processed.fileManager(); - EXPECT_TRUE(PI.isSelfContained(FM.getFile("guarded.h").get())); - EXPECT_FALSE(PI.isSelfContained(FM.getFile("unguarded.h").get())); + EXPECT_TRUE(PI.isSelfContained(*FM.getOptionalFileRef("guarded.h"))); + EXPECT_FALSE(PI.isSelfContained(*FM.getOptionalFileRef("unguarded.h"))); } TEST_F(PragmaIncludeTest, AlwaysKeep) { @@ -597,8 +596,8 @@ TEST_F(PragmaIncludeTest, AlwaysKeep) { Inputs.ExtraFiles["usual.h"] = "#pragma once"; TestAST Processed = build(); auto &FM = Processed.fileManager(); - EXPECT_TRUE(PI.shouldKeep(FM.getFile("always_keep.h").get())); - EXPECT_FALSE(PI.shouldKeep(FM.getFile("usual.h").get())); + EXPECT_TRUE(PI.shouldKeep(*FM.getOptionalFileRef("always_keep.h"))); + EXPECT_FALSE(PI.shouldKeep(*FM.getOptionalFileRef("usual.h"))); } TEST_F(PragmaIncludeTest, ExportInUnnamedBuffer) { @@ -654,13 +653,13 @@ 
TEST_F(PragmaIncludeTest, OutlivesFMAndSM) { // Now this build gives us a new File&Source Manager. TestAST Processed = build(/*ResetPragmaIncludes=*/false); auto &FM = Processed.fileManager(); - auto PrivateFE = FM.getFile("private.h"); + auto PrivateFE = FM.getOptionalFileRef("private.h"); assert(PrivateFE); - EXPECT_EQ(PI.getPublic(PrivateFE.get()), "\"public.h\""); + EXPECT_EQ(PI.getPublic(*PrivateFE), "\"public.h\""); - auto Private2FE = FM.getFile("private2.h"); + auto Private2FE = FM.getOptionalFileRef("private2.h"); assert(Private2FE); - EXPECT_THAT(PI.getExporters(Private2FE.get(), FM), + EXPECT_THAT(PI.getExporters(*Private2FE, FM), testing::ElementsAre(llvm::cantFail(FM.getFileRef("public.h")))); } @@ -677,8 +676,8 @@ TEST_F(PragmaIncludeTest, CanRecordManyTimes) { TestAST Processed = build(); auto &FM = Processed.fileManager(); - auto PrivateFE = FM.getFile("private.h"); - llvm::StringRef Public = PI.getPublic(PrivateFE.get()); + auto PrivateFE = FM.getOptionalFileRef("private.h"); + llvm::StringRef Public = PI.getPublic(*PrivateFE); EXPECT_EQ(Public, "\"public.h\""); // This build populates same PI during build, but this time we don't have diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 2c00c76c85533..4bb3bae0503ac 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -621,7 +621,6 @@ class CollectEntitiesVisitor std::string Name; llvm::raw_string_ostream OS(Name); ND->printQualifiedName(OS); - OS.flush(); if (Name.empty()) return true; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bdlb_nullablevalue.h b/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bdlb_nullablevalue.h new file mode 100644 index 0000000000000..4411bcfd60a74 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bdlb_nullablevalue.h @@ -0,0 +1,38 @@ +#ifndef LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_NULLABLEVALUE_H_ +#define LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_NULLABLEVALUE_H_ + +#include "bsl_optional.h" + +/// Mock of `bdlb::NullableValue`. 
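+///
+/// A hypothetical usage sketch (illustrative only; it assumes the dataflow
+/// model treats `isNull()` as the negation of `has_value()`):
+///
+///   BloombergLP::bdlb::NullableValue<int> N;
+///   int A = N.value();                       // unchecked access: warning
+///   if (!N.isNull()) { int B = N.value(); }  // guarded access: no warning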
+namespace BloombergLP::bdlb {
+
+template <typename T>
+class NullableValue : public bsl::optional<T> {
+public:
+  constexpr NullableValue() noexcept;
+
+  constexpr NullableValue(bsl::nullopt_t) noexcept;
+
+  NullableValue(const NullableValue &) = default;
+
+  NullableValue(NullableValue &&) = default;
+
+  const T &value() const &;
+  T &value() &;
+
+  // 'operator bool' is inherited from bsl::optional
+
+  constexpr bool isNull() const noexcept;
+
+  template <typename U>
+  constexpr T valueOr(U &&v) const &;
+
+  // 'reset' is inherited from bsl::optional
+
+  template <typename U> NullableValue &operator=(const U &u);
+};
+
+
+} // namespace BloombergLP::bdlb
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_NULLABLEVALUE_H_
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bsl_optional.h b/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bsl_optional.h
new file mode 100644
index 0000000000000..7e1a129e04a55
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/Inputs/unchecked-optional-access/bde/types/bsl_optional.h
@@ -0,0 +1,75 @@
+#ifndef LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_OPTIONAL_H_
+#define LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_OPTIONAL_H_
+
+/// Mock of `bsl::optional`.
+namespace bsl {
+
+// clang-format off
+template <typename T> struct remove_reference       { using type = T; };
+template <typename T> struct remove_reference<T &>  { using type = T; };
+template <typename T> struct remove_reference<T &&> { using type = T; };
+// clang-format on
+
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+
+template <typename T>
+constexpr T &&forward(remove_reference_t<T> &t) noexcept;
+
+template <typename T>
+constexpr T &&forward(remove_reference_t<T> &&t) noexcept;
+
+template <typename T>
+constexpr remove_reference_t<T> &&move(T &&x);
+
+struct nullopt_t {
+  constexpr explicit nullopt_t() {}
+};
+
+constexpr nullopt_t nullopt;
+
+template <typename T>
+class optional {
+public:
+  constexpr optional() noexcept;
+
+  constexpr optional(nullopt_t) noexcept;
+
+  optional(const optional &) = default;
+
+  optional(optional &&) = default;
+
+  const T &operator*() const &;
+  T &operator*() &;
+  const T &&operator*() const &&;
+  T &&operator*() &&;
+
+  const T *operator->() const;
+  T *operator->();
+
+  const T &value() const &;
+  T &value() &;
+  const T &&value() const &&;
+  T &&value() &&;
+
+  constexpr explicit operator bool() const noexcept;
+  constexpr bool has_value() const noexcept;
+
+  template <typename U>
+  constexpr T value_or(U &&v) const &;
+  template <typename U>
+  T value_or(U &&v) &&;
+
+  template <typename... Args>
+  T &emplace(Args &&...args);
+
+  void reset() noexcept;
+
+  void swap(optional &rhs) noexcept;
+
+  template <typename U> optional &operator=(const U &u);
+};
+
+} // namespace bsl
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_TEST_CLANG_TIDY_CHECKERS_INPUTS_BDE_TYPES_OPTIONAL_H_
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp
index 271893c707069..76d447a71d68b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp
@@ -74,6 +74,9 @@ void warningLessThanZero() {
   if (pthread_yield() < 0) {}
   // CHECK-MESSAGES:
:[[@LINE-1]]:31: warning: if (pthread_yield() >= 0) {} // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: - + if (0 <= pthread_yield()) {} + // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: } void warningEqualsNegative() { @@ -120,7 +124,14 @@ void warningEqualsNegative() { // CHECK-MESSAGES: :[[@LINE-1]]:46: warning: if (pthread_create(NULL, NULL, NULL, NULL) < -1) {} // CHECK-MESSAGES: :[[@LINE-1]]:46: warning: - + if (-1 == pthread_create(NULL, NULL, NULL, NULL)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: + if (-1 != pthread_create(NULL, NULL, NULL, NULL)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: + if (-1 >= pthread_create(NULL, NULL, NULL, NULL)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: + if (-1 > pthread_create(NULL, NULL, NULL, NULL)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: } void WarningWithMacro() { @@ -162,6 +173,16 @@ void noWarning() { if (posix_openpt(0) < -1) {} if (posix_fadvise(0, 0, 0, 0) <= 0) {} if (posix_fadvise(0, 0, 0, 0) == 1) {} + if (0 > posix_openpt(0)) {} + if (0 >= posix_openpt(0)) {} + if (-1 == posix_openpt(0)) {} + if (-1 != posix_openpt(0)) {} + if (-1 >= posix_openpt(0)) {} + if (-1 > posix_openpt(0)) {} + if (posix_fadvise(0, 0, 0, 0) <= 0) {} + if (posix_fadvise(0, 0, 0, 0) == 1) {} + if (0 >= posix_fadvise(0, 0, 0, 0)) {} + if (1 == posix_fadvise(0, 0, 0, 0)) {} } namespace i { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-bad-config.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-bad-config.cpp new file mode 100644 index 0000000000000..73bfb7acbc464 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-bad-config.cpp @@ -0,0 +1,11 @@ +// RUN: %check_clang_tidy %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: false, \ +// RUN: bugprone-tagged-union-member-count.CountingEnumSuffixes: "count", \ +// RUN: bugprone-tagged-union-member-count.CountingEnumPrefixes: "last", \ +// RUN: }}' + +// Warn when the heuristic is disabled and a suffix or a prefix is set explicitly. 
+ +// CHECK-MESSAGES: warning: bugprone-tagged-union-member-count: Counting enum heuristic is disabled but CountingEnumPrefixes is set +// CHECK-MESSAGES: warning: bugprone-tagged-union-member-count: Counting enum heuristic is disabled but CountingEnumSuffixes is set diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-disabled.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-disabled.cpp new file mode 100644 index 0000000000000..dca52170a749a --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-disabled.cpp @@ -0,0 +1,65 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.StrictMode: true, \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: false, \ +// RUN: }}' -- + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has fewer data members (3) than tags (4) +struct IncorrectBecauseHeuristicIsDisabledPrefixCase { + enum { + tags11, + tags22, + tags33, + lasttag, + } Tags; + union { + char A; + short B; + int C; + } Data; +}; + +struct CorrectBecauseHeuristicIsDisabledPrefixCase { // No warnings expected + enum { + tags1, + tags2, + tags3, + lasttags, + } Tags; + union { + char A; + short B; + int C; + long D; + } Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has fewer data members (3) than tags (4) +struct IncorrectBecauseHeuristicIsDisabledSuffixCase { + enum { + tags11, + tags22, + tags33, + tags_count, + } Tags; + union { + char A; + short B; + int C; + } Data; +}; + +struct CorrectBecauseHeuristicIsDisabledSuffixCase { // No warnings expected + enum { + tags1, + tags2, + tags3, + tags_count, + } Tags; + union { + char A; + short B; + int C; + long D; + } Data; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-enabled.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-enabled.cpp new file mode 100644 index 0000000000000..96aef122e85ef --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-heuristic-is-enabled.cpp @@ -0,0 +1,156 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.StrictMode: false, \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: true, \ +// RUN: bugprone-tagged-union-member-count.CountingEnumSuffixes: "count", \ +// RUN: bugprone-tagged-union-member-count.CountingEnumPrefixes: "last", \ +// RUN: }}' -- + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (3) than tags (2) +struct IncorrectBecauseHeuristicIsEnabledPrefixCase { + enum { + tags1, + tags2, + lasttag, + } Tags; + union { + char A; + short B; + int C; + } Data; +}; + +struct CorrectBecauseHeuristicIsEnabledPrefixCase { // No warnings expected + enum { + tags1, + tags2, + tags3, + lasttag, + } Tags; + union { + int A; + int B; + int C; + } Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (3) than tags (2) +struct IncorrectBecauseHeuristicIsEnabledSuffixCase { + enum { + tags1, + tags2, + tags_count, + } Tags; + union { + char A; + short B; + 
int C; + } Data; +}; + +struct CorrectBecauseHeuristicIsEnabledSuffixCase { // No warnings expected + enum { + tags1, + tags2, + tags3, + tags_count, + } Tags; + union { + int A; + int B; + int C; + } Data; +}; + +union Union4 { + short *Shorts; + double *Doubles; + int *Ints; + float *Floats; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct CountingEnumCaseInsensitivityTest1 { + enum { + node_type_loop, + node_type_branch, + node_type_function, + node_type_count, + } Kind; + union Union4 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct CountingEnumCaseInsensitivityTest2 { + enum { + NODE_TYPE_LOOP, + NODE_TYPE_BRANCH, + NODE_TYPE_FUNCTION, + NODE_TYPE_COUNT, + } Kind; + union Union4 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TagWhereCountingEnumIsAliased { + enum { + tag_alias_counter1 = 1, + tag_alias_counter2 = 2, + tag_alias_counter3 = 3, + tag_alias_other_count = 3, + } Kind; + union { + char C; + short S; + int I; + long L; + } Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (2) +struct TagWithCountingEnumButOtherValueIsAliased { + enum { + tag_alias_other1 = 1, + tag_alias_other2 = 1, + tag_alias_other3 = 3, + tag_alias_other_count = 2, + } Kind; + union { + char C; + short S; + int I; + long L; + } Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TagWhereCounterIsTheSmallest { + enum { + tag_large1 = 1000, + tag_large2 = 1001, + tag_large3 = 1002, + tag_large_count = 3, + } Kind; + union { + char C; + short S; + int I; + long L; + } Data; +}; + +// No warnings expected, only the last enum constant can be a counting enum constant +struct TagWhereCounterLikeNameIsNotLast { + enum { + kind_count, + kind2, + last_kind1, + kind3, + } Kind; + union { + char C; + short S; + int I; + long L; + } Data; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes-and-suffixes.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes-and-suffixes.cpp new file mode 100644 index 0000000000000..c0e33ac6f6f36 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes-and-suffixes.cpp @@ -0,0 +1,52 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.StrictMode: false, \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: true, \ +// RUN: bugprone-tagged-union-member-count.CountingEnumSuffixes: "count", \ +// RUN: bugprone-tagged-union-member-count.CountingEnumPrefixes: "last", \ +// RUN: }}' -- + +union Union3 { + short *Shorts; + int *Ints; + float *Floats; +}; + +union Union4 { + short *Shorts; + double *Doubles; + int *Ints; + float *Floats; +}; + +// The heuristic only considers the last enum constant +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TaggedUnionPrefixAndSuffixMatch { + enum { + tags1, + tags2, + tagscount, + lasttags + } Kind; + Union4 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (3) than tags (2) +struct TaggedUnionOnlyPrefixMatch { + enum { 
+ prefixtag1, + prefixtag2, + lastprefixtag + } Kind; + Union3 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (3) than tags (2) +struct TaggedUnionOnlySuffixMatch { + enum { + suffixtag1, + suffixtag2, + suffixtagcount + } Kind; + Union3 Data; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes.cpp new file mode 100644 index 0000000000000..c287b1953a333 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-prefixes.cpp @@ -0,0 +1,35 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.StrictMode: false, \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: true, \ +// RUN: bugprone-tagged-union-member-count.CountingEnumPrefixes: "maxsize;last", \ +// RUN: }}' -- + +union Union4 { + short *Shorts; + double *Doubles; + int *Ints; + float *Floats; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TaggedUnionWithMaxsizeAsCounterPrefix { + enum { + twc1, + twc2, + twc3, + maxsizetwc, + } Kind; + Union4 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TaggedUnionWithLastAsCounterPrefix { + enum { + twc11, + twc22, + twc33, + lasttwc, + } Kind; + Union4 Data; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-suffixes.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-suffixes.cpp new file mode 100644 index 0000000000000..f248f2efaa5ad --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-counting-enum-suffixes.cpp @@ -0,0 +1,35 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-tagged-union-member-count.StrictMode: false, \ +// RUN: bugprone-tagged-union-member-count.EnableCountingEnumHeuristic: true, \ +// RUN: bugprone-tagged-union-member-count.CountingEnumSuffixes: "count;size", \ +// RUN: }}' -- + +typedef union Union4 { + short *Shorts; + double *Doubles; + int *Ints; + float *Floats; +} union4; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TaggedUnionWithCounterCountSuffix { + enum { + twc1, + twc2, + twc3, + twc_count, + } Kind; + union Union4 Data; +}; + +// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3) +struct TaggedUnionWithCounterSizeSuffix { + enum { + twc11, + twc22, + twc33, + twc_size, + } Kind; + union Union4 Data; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-disabled.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-disabled.cpp new file mode 100644 index 0000000000000..c39683c3c40f6 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-disabled.cpp @@ -0,0 +1,27 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \ +// RUN: -config='{CheckOptions: { \ +// RUN: 
bugprone-tagged-union-member-count.StrictMode: false, \
+// RUN: }}' --
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (2) than tags (1)
+struct Incorrect {
+  enum {
+    tags1,
+  } Tags;
+  union {
+    char A;
+    short B;
+  } Data;
+};
+
+struct CorrectBecauseStrictModeIsDisabled { // No warnings expected
+  enum {
+    tags1,
+    tags2,
+    tags3,
+  } Tags;
+  union {
+    char A;
+    short B;
+  } Data;
+};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-enabled.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-enabled.cpp
new file mode 100644
index 0000000000000..10d376d791968
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count-strictmode-is-enabled.cpp
@@ -0,0 +1,30 @@
+// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t \
+// RUN:   -config='{CheckOptions: { \
+// RUN:     bugprone-tagged-union-member-count.StrictMode: true, \
+// RUN: }}' --
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has fewer data members (2) than tags (3)
+struct IncorrectBecauseStrictmodeIsEnabled {
+  enum {
+    tags1,
+    tags2,
+    tags3,
+  } Tags;
+  union {
+    char A;
+    short B;
+  } Data;
+};
+
+struct Correct { // No warnings expected
+  enum {
+    tags1,
+    tags2,
+    tags3,
+  } Tags;
+  union {
+    char A;
+    short B;
+    int C;
+  } Data;
+};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.c
new file mode 100644
index 0000000000000..60c93c553baca
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.c
@@ -0,0 +1,149 @@
+// RUN: %check_clang_tidy %s bugprone-tagged-union-member-count %t
+
+typedef enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+} Tags3;
+
+typedef enum Tags4 {
+  tags4_1,
+  tags4_2,
+  tags4_3,
+  tags4_4,
+} Tags4;
+
+typedef union Union3 {
+  short *Shorts;
+  int *Ints;
+  float *Floats;
+} Union3;
+
+typedef union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+} Union4;
+
+// It is not obvious which enum is the tag for the union.
+struct maybeTaggedUnion1 { // No warnings expected.
+  enum Tags3 TagA;
+  enum Tags4 TagB;
+  union Union4 Data;
+};
+
+// It is not obvious which union the tag belongs to.
+struct maybeTaggedUnion2 { // No warnings expected.
+  enum Tags3 Tag;
+  union Union3 DataB;
+  union Union3 DataA;
+};
+
+// It is not obvious which union the tag belongs to.
+struct maybeTaggedUnion3 { // No warnings expected.
+  enum Tags3 Tag;
+  union {
+    int I1;
+    int I2;
+    int I3;
+  };
+  union {
+    float F1;
+    float F2;
+    float F3;
+  };
+};
+
+// No warnings expected, because LastATag is just an alias.
+struct TaggedUnionWithAliasedEnumConstant {
+  enum {
+    ATag1,
+    ATag2,
+    ATag3,
+    LastATag = ATag3,
+  } Tag;
+  union {
+    float F;
+    int *Ints;
+    char Key[8];
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithPredefinedTagAndPredefinedUnion {
+  enum Tags3 Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithPredefinedTagAndInlineUnion {
+  enum Tags3 Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    struct {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithInlineTagAndPredefinedUnion {
+  enum {
+    TaggedUnion7tag1,
+    TaggedUnion7tag2,
+    TaggedUnion7tag3,
+  } Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithInlineTagAndInlineUnion {
+  enum {
+    TaggedUnion8tag1,
+    TaggedUnion8tag2,
+    TaggedUnion8tag3,
+  } Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    struct {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructNesting {
+  enum Tags3 Tag;
+  union {
+    float F;
+    int I;
+    long L;
+    // CHECK-MESSAGES: :[[@LINE+1]]:12: warning: tagged union has more data members (4) than tags (3)
+    struct innerdecl {
+      enum Tags3 Tag;
+      union Union4 Data;
+    } Inner;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithTypedefedTagAndTypedefedUnion {
+  Tags3 Tag;
+  Union4 Data;
+};
+
+#define DECLARE_TAGGED_UNION_STRUCT(Tag, Union, Name)\
+struct Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:44: warning: tagged union has more data members (4) than tags (3)
+DECLARE_TAGGED_UNION_STRUCT(Tags3, Union4, TaggedUnionStructFromMacro);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.cpp
new file mode 100644
index 0000000000000..25827e8c8de0c
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.cpp
@@ -0,0 +1,310 @@
+// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-tagged-union-member-count %t
+// Test check with C++ features.
+
+typedef enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+} Tags3;
+
+typedef enum Tags4 {
+  tags4_1,
+  tags4_2,
+  tags4_3,
+  tags4_4,
+} Tags4;
+
+enum class Classtags3 {
+  classtags3_1,
+  classtags3_2,
+  classtags3_3,
+};
+
+enum class Typedtags3 : unsigned int {
+  typedtags3_1,
+  typedtags3_2,
+  typedtags3_3,
+};
+
+typedef union Union3 {
+  short *Shorts;
+  int *Ints;
+  float *Floats;
+} Union3;
+
+typedef union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+} Union4;
+
+// It is not obvious which enum is the tag for the union.
+class MaybeTaggedUnion1 { // No warnings expected.
+  enum Tags3 TagA;
+  enum Tags4 TagB;
+  union Union4 Data;
+};
+
+// It is not obvious which union the tag belongs to.
+class MaybeTaggedUnion2 { // No warnings expected.
+  enum Tags3 Tag;
+  union Union3 DataB;
+  union Union3 DataA;
+};
+
+// It is not obvious which union the tag belongs to.
+class MaybeTaggedUnion3 { // No warnings expected.
+  enum Tags3 Tag;
+  union {
+    int I1;
+    int I2;
+    int I3;
+  };
+  union {
+    float F1;
+    float F2;
+    float F3;
+  };
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassPredefinedTagAndPredefinedUnion {
+  enum Tags3 Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassPredefinedTagAndInlineUnion {
+  enum Tags3 Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    class {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassInlineTagAndPredefinedUnion {
+  enum {
+    tag1,
+    tag2,
+    tag3,
+  } Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassInlineTagAndInlineUnion {
+  enum {
+    tag1,
+    tag2,
+    tag3,
+  } Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    class {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithNestedTaggedUnionClass {
+  enum Tags3 Tag;
+  union {
+    float F;
+    int I;
+    long L;
+    // CHECK-MESSAGES: :[[@LINE+1]]:11: warning: tagged union has more data members (4) than tags (3)
+    class Innerdecl {
+      enum Tags3 Tag;
+      union Union4 Data;
+    } Inner;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithTypedefedTag {
+  Tags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithEnumClass {
+  enum Classtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClasswithEnumClass {
+  enum Classtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithTypedEnum {
+  Typedtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithTypedEnum {
+  Typedtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct AnonymousTaggedUnionStruct {
+  Tags3 Tag;
+  union {
+    char A;
+    short B;
+    int C;
+    long D;
+  };
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithAnonymousUnion {
+  Tags3 Tag;
+  union {
+    char A;
+    short B;
+    int C;
+    long D;
+  };
+};
+
+namespace testnamespace {
+
+enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+};
+
+union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructInNamespace {
+  Tags3 Tags;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class 
TaggedUnionClassInNamespace {
+  Tags3 Tags;
+  Union4 Data;
+};
+
+} // namespace testnamespace
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithNamespacedTagAndUnion {
+  testnamespace::Tags3 Tags;
+  testnamespace::Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithNamespacedTagAndUnion {
+  testnamespace::Tags3 Tags;
+  testnamespace::Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+struct TemplatedStructWithNamespacedTagAndUnion {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedStructWithNamespacedTagAndUnion<testnamespace::Tags3, testnamespace::Union4> TemplatedStruct3;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+class TemplatedClassWithNamespacedTagAndUnion {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedClassWithNamespacedTagAndUnion<testnamespace::Tags3, testnamespace::Union4> TemplatedClass3;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+struct TemplatedStruct {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedStruct<Tags3, Union3> TemplatedStruct1; // No warning expected
+TemplatedStruct<Tags3, Union4> TemplatedStruct2;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+class TemplatedClass {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedClass<Tags3, Union3> TemplatedClass1; // No warning expected
+TemplatedClass<Tags3, Union4> TemplatedClass2;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename T>
+struct TemplatedStructButTaggedUnionPartIsNotTemplated {
+  Tags3 Kind;
+  Union4 Data;
+  T SomethingElse;
+};
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename T>
+class TemplatedClassButTaggedUnionPartIsNotTemplated {
+  Tags3 Kind;
+  Union4 Data;
+  T SomethingElse;
+};
+
+#define DECLARE_TAGGED_UNION_STRUCT(Tag, Union, Name)\
+struct Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:44: warning: tagged union has more data members (4) than tags (3)
+DECLARE_TAGGED_UNION_STRUCT(Tags3, Union4, TaggedUnionStructFromMacro);
+
+#define DECLARE_TAGGED_UNION_CLASS(Tag, Union, Name)\
+class Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:43: warning: tagged union has more data members (4) than tags (3)
+DECLARE_TAGGED_UNION_CLASS(Tags3, Union4, TaggedUnionClassFromMacro);
+
+// Lambdas implicitly compile down to an unnamed CXXRecordDecl, and if they have captures,
+// then those become unnamed fields.
+void DoNotMatchLambdas() {
+  enum {
+    A
+  } e;
+  union {
+    long A;
+    char B;
+  } u;
+  auto L = [e, u] () {};
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.m b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.m
new file mode 100644
index 0000000000000..60c93c553baca
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.m
@@ -0,0 +1,149 @@
+// RUN: %check_clang_tidy %s bugprone-tagged-union-member-count %t
+
+typedef enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+} Tags3;
+
+typedef enum Tags4 {
+  tags4_1,
+  tags4_2,
+  tags4_3,
+  tags4_4,
+} Tags4;
+
+typedef union Union3 {
+  short *Shorts;
+  int *Ints;
+  float *Floats;
+} Union3;
+
+typedef union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+} Union4;
+
+// It is not obvious which enum is the tag for the union.
+struct maybeTaggedUnion1 { // No warnings expected.
+  enum Tags3 TagA;
+  enum Tags4 TagB;
+  union Union4 Data;
+};
+
+// It is not obvious which union the tag belongs to.
+struct maybeTaggedUnion2 { // No warnings expected.
+  enum Tags3 Tag;
+  union Union3 DataB;
+  union Union3 DataA;
+};
+
+// It is not obvious which union the tag belongs to.
+struct maybeTaggedUnion3 { // No warnings expected.
+  enum Tags3 Tag;
+  union {
+    int I1;
+    int I2;
+    int I3;
+  };
+  union {
+    float F1;
+    float F2;
+    float F3;
+  };
+};
+
+// No warnings expected, because LastATag is just an alias.
+struct TaggedUnionWithAliasedEnumConstant {
+  enum {
+    ATag1,
+    ATag2,
+    ATag3,
+    LastATag = ATag3,
+  } Tag;
+  union {
+    float F;
+    int *Ints;
+    char Key[8];
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithPredefinedTagAndPredefinedUnion {
+  enum Tags3 Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithPredefinedTagAndInlineUnion {
+  enum Tags3 Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    struct {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithInlineTagAndPredefinedUnion {
+  enum {
+    TaggedUnion7tag1,
+    TaggedUnion7tag2,
+    TaggedUnion7tag3,
+  } Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithInlineTagAndInlineUnion {
+  enum {
+    TaggedUnion8tag1,
+    TaggedUnion8tag2,
+    TaggedUnion8tag3,
+  } Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    struct {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructNesting {
+  enum Tags3 Tag;
+  union {
+    float F;
+    int I;
+    long L;
+    // CHECK-MESSAGES: :[[@LINE+1]]:12: warning: tagged union has more data members (4) than tags (3)
+    struct innerdecl {
+      enum Tags3 Tag;
+      union Union4 Data;
+    } Inner;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithTypedefedTagAndTypedefedUnion {
+  Tags3 Tag;
+  Union4 Data;
+};
+
+#define DECLARE_TAGGED_UNION_STRUCT(Tag, Union, Name)\
+struct Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:44: warning: tagged union has more data 
members (4) than tags (3)
+DECLARE_TAGGED_UNION_STRUCT(Tags3, Union4, TaggedUnionStructFromMacro);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.mm b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.mm
new file mode 100644
index 0000000000000..8b308555281c5
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/tagged-union-member-count.mm
@@ -0,0 +1,309 @@
+// RUN: %check_clang_tidy %s bugprone-tagged-union-member-count %t
+
+typedef enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+} Tags3;
+
+typedef enum Tags4 {
+  tags4_1,
+  tags4_2,
+  tags4_3,
+  tags4_4,
+} Tags4;
+
+enum class Classtags3 {
+  classtags3_1,
+  classtags3_2,
+  classtags3_3,
+};
+
+enum class Typedtags3 : unsigned int {
+  typedtags3_1,
+  typedtags3_2,
+  typedtags3_3,
+};
+
+typedef union Union3 {
+  short *Shorts;
+  int *Ints;
+  float *Floats;
+} Union3;
+
+typedef union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+} Union4;
+
+// It is not obvious which enum is the tag for the union.
+class MaybeTaggedUnion1 { // No warnings expected.
+  enum Tags3 TagA;
+  enum Tags4 TagB;
+  union Union4 Data;
+};
+
+// It is not obvious which union the tag belongs to.
+class MaybeTaggedUnion2 { // No warnings expected.
+  enum Tags3 Tag;
+  union Union3 DataB;
+  union Union3 DataA;
+};
+
+// It is not obvious which union the tag belongs to.
+class MaybeTaggedUnion3 { // No warnings expected.
+  enum Tags3 Tag;
+  union {
+    int I1;
+    int I2;
+    int I3;
+  };
+  union {
+    float F1;
+    float F2;
+    float F3;
+  };
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassPredefinedTagAndPredefinedUnion {
+  enum Tags3 Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassPredefinedTagAndInlineUnion {
+  enum Tags3 Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    class {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassInlineTagAndPredefinedUnion {
+  enum {
+    tag1,
+    tag2,
+    tag3,
+  } Tag;
+  union Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassInlineTagAndInlineUnion {
+  enum {
+    tag1,
+    tag2,
+    tag3,
+  } Tag;
+  union {
+    int *Ints;
+    char Characters[13];
+    class {
+      double Re;
+      double Im;
+    } Complex;
+    long L;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithNestedTaggedUnionClass {
+  enum Tags3 Tag;
+  union {
+    float F;
+    int I;
+    long L;
+    // CHECK-MESSAGES: :[[@LINE+1]]:11: warning: tagged union has more data members (4) than tags (3)
+    class Innerdecl {
+      enum Tags3 Tag;
+      union Union4 Data;
+    } Inner;
+  } Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithTypedefedTag {
+  Tags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithEnumClass {
+  enum Classtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClasswithEnumClass {
+  enum Classtags3 Tag;
+  Union4 
Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithTypedEnum {
+  Typedtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithTypedEnum {
+  Typedtags3 Tag;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct AnonymousTaggedUnionStruct {
+  Tags3 Tag;
+  union {
+    char A;
+    short B;
+    int C;
+    long D;
+  };
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithAnonymousUnion {
+  Tags3 Tag;
+  union {
+    char A;
+    short B;
+    int C;
+    long D;
+  };
+};
+
+namespace testnamespace {
+
+enum Tags3 {
+  tags3_1,
+  tags3_2,
+  tags3_3,
+};
+
+union Union4 {
+  short *Shorts;
+  double *Doubles;
+  int *Ints;
+  float *Floats;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructInNamespace {
+  Tags3 Tags;
+  Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassInNamespace {
+  Tags3 Tags;
+  Union4 Data;
+};
+
+} // namespace testnamespace
+
+// CHECK-MESSAGES: :[[@LINE+1]]:8: warning: tagged union has more data members (4) than tags (3)
+struct TaggedUnionStructWithNamespacedTagAndUnion {
+  testnamespace::Tags3 Tags;
+  testnamespace::Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: tagged union has more data members (4) than tags (3)
+class TaggedUnionClassWithNamespacedTagAndUnion {
+  testnamespace::Tags3 Tags;
+  testnamespace::Union4 Data;
+};
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+struct TemplatedStructWithNamespacedTagAndUnion {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedStructWithNamespacedTagAndUnion<testnamespace::Tags3, testnamespace::Union4> TemplatedStruct3;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+class TemplatedClassWithNamespacedTagAndUnion {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedClassWithNamespacedTagAndUnion<testnamespace::Tags3, testnamespace::Union4> TemplatedClass3;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+struct TemplatedStruct {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedStruct<Tags3, Union3> TemplatedStruct1; // No warning expected
+TemplatedStruct<Tags3, Union4> TemplatedStruct2;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename Tag, typename Union>
+class TemplatedClass {
+  Tag Kind;
+  Union Data;
+};
+
+TemplatedClass<Tags3, Union3> TemplatedClass1; // No warning expected
+TemplatedClass<Tags3, Union4> TemplatedClass2;
+
+// CHECK-MESSAGES: :[[@LINE+2]]:8: warning: tagged union has more data members (4) than tags (3)
+template <typename T>
+struct TemplatedStructButTaggedUnionPartIsNotTemplated {
+  Tags3 Kind;
+  Union4 Data;
+  T SomethingElse;
+};
+
+// CHECK-MESSAGES: :[[@LINE+2]]:7: warning: tagged union has more data members (4) than tags (3)
+template <typename T>
+class TemplatedClassButTaggedUnionPartIsNotTemplated {
+  Tags3 Kind;
+  Union4 Data;
+  T SomethingElse;
+};
+
+#define DECLARE_TAGGED_UNION_STRUCT(Tag, Union, Name)\
+struct Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:44: warning: tagged union has more data members (4) than tags (3)
+DECLARE_TAGGED_UNION_STRUCT(Tags3, Union4, TaggedUnionStructFromMacro);
+
+#define DECLARE_TAGGED_UNION_CLASS(Tag, Union, Name)\
+class Name {\
+  Tag Kind;\
+  Union Data;\
+}
+
+// CHECK-MESSAGES: :[[@LINE+1]]:43: warning: tagged union has more data members (4) than tags (3)
+DECLARE_TAGGED_UNION_CLASS(Tags3, Union4, TaggedUnionClassFromMacro);
+
+// Lambdas implicitly compile down to an unnamed CXXRecordDecl, and if they have captures,
+// then those become unnamed fields.
+void DoNotMatchLambdas() {
+  enum {
+    A
+  } e;
+  union {
+    long A;
+    char B;
+  } u;
+  auto L = [e, u] () {};
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access.cpp
index 13a3ff52f3ebc..3167b85f0e024 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access.cpp
@@ -2,6 +2,8 @@
 #include "absl/types/optional.h"
 #include "folly/types/Optional.h"
+#include "bde/types/bsl_optional.h"
+#include "bde/types/bdlb_nullablevalue.h"
 
 void unchecked_value_access(const absl::optional<int> &opt) {
   opt.value();
@@ -50,6 +52,95 @@ void folly_checked_access(const folly::Optional<int> &opt) {
   }
 }
 
+void bsl_optional_unchecked_value_access(const bsl::optional<int> &opt) {
+  opt.value();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  int x = *opt;
+  // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  if (!opt) {
+    return;
+  }
+
+  opt.value();
+  x = *opt;
+}
+
+void bsl_optional_checked_access(const bsl::optional<int> &opt) {
+  if (opt.has_value()) {
+    opt.value();
+  }
+  if (opt) {
+    opt.value();
+  }
+}
+
+void bsl_optional_value_after_swap(bsl::optional<int> &opt1, bsl::optional<int> &opt2) {
+  if (opt1) {
+    opt1.swap(opt2);
+    opt1.value();
+    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: unchecked access to optional value
+  }
+}
+
+void nullable_value_unchecked_value_access(const BloombergLP::bdlb::NullableValue<int> &opt) {
+  opt.value();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  int x = *opt;
+  // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  if (opt.isNull()) {
+    opt.value();
+  }
+  // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  if (!opt) {
+    opt.value();
+  }
+  // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  if (!opt) {
+    return;
+  }
+
+  opt.value();
+  x = *opt;
+}
+
+void nullable_value_optional_checked_access(const BloombergLP::bdlb::NullableValue<int> &opt) {
+  if (opt.has_value()) {
+    opt.value();
+  }
+  if (opt) {
+    opt.value();
+  }
+  if (!opt.isNull()) {
+    opt.value();
+  }
+}
+
+void nullable_value_emplaced(BloombergLP::bdlb::NullableValue<int> &opt) {
+  opt.value();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+
+  opt.emplace(1);
+  opt.value();
+
+  opt.reset();
+  opt.value();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value [bugprone-unchecked-optional-access]
+}
+
+void nullable_value_after_swap(BloombergLP::bdlb::NullableValue<int> &opt1, BloombergLP::bdlb::NullableValue<int> &opt2) {
+  if (opt1) {
+    opt1.swap(opt2);
+    opt1.value();
+    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: unchecked access to optional value
+  }
+}
+
 template <typename T>
 void 
function_template_without_user(const absl::optional &opt) { opt.value(); // no-warning diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp new file mode 100644 index 0000000000000..fc97d1bc93bc5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp @@ -0,0 +1,44 @@ +// RUN: %check_clang_tidy -check-suffix=NON-STRICT-REGEX %s bugprone-unsafe-functions %t --\ +// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix'}}" +// RUN: %check_clang_tidy -check-suffix=STRICT-REGEX %s bugprone-unsafe-functions %t --\ +// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match'}}" + +void name_match(); +void prefix_match(); + +namespace regex_test { +void name_match(); +void prefix_match(); +} + +void name_match_regex(); +void prefix_match_regex(); + +void f1() { + name_match(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead + prefix_match(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match' is matched on qualname prefix; it should not be used + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'prefix_match' is a full qualname match; it should not be used + + ::name_match(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead + regex_test::name_match(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead + name_match_regex(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match; 'replacement' should be used instead + // no-warning STRICT-REGEX + + ::prefix_match(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match' is matched on qualname prefix; it should not be used + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'prefix_match' is a full qualname match; it should not be used + regex_test::prefix_match(); + // no-warning NON-STRICT-REGEX + // no-warning STRICT-REGEX + prefix_match_regex(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match_regex' is matched on qualname prefix; it should not be used + // no-warning STRICT-REGEX +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c new file mode 100644 index 0000000000000..7fd71ec2f2e7b --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c @@ -0,0 +1,27 @@ +// 
RUN: %check_clang_tidy -check-suffix=NON-STRICT-REGEX %s bugprone-unsafe-functions %t --\
+// RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix'}}"
+// RUN: %check_clang_tidy -check-suffix=STRICT-REGEX %s bugprone-unsafe-functions %t --\
+// RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match'}}"
+
+void name_match();
+void prefix_match();
+
+void name_match_regex();
+void prefix_match_regex();
+
+void f1() {
+  name_match();
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead
+  // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead
+  prefix_match();
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match' is matched on qualname prefix; it should not be used
+  // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'prefix_match' is a full qualname match; it should not be used
+
+  name_match_regex();
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match; 'replacement' should be used instead
+  // no-warning STRICT-REGEX
+
+  prefix_match_regex();
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match_regex' is matched on qualname prefix; it should not be used
+  // no-warning STRICT-REGEX
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions.c
index 4bc2bad996d70..0409dd6bfcaa3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions.c
@@ -12,6 +12,12 @@
 // RUN: %check_clang_tidy -check-suffix=WITH-ANNEX-K-CERT-ONLY %s bugprone-unsafe-functions %t -- \
 // RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.ReportMoreUnsafeFunctions: false}}" \
 // RUN:   -- -D__STDC_LIB_EXT1__=1 -D__STDC_WANT_LIB_EXT1__=1
+// RUN: %check_clang_tidy -check-suffix=WITH-NONE-ENABLED %s bugprone-unsafe-functions %t --\
+// RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.ReportDefaultFunctions: false}}" \
+// RUN:   -- -D__STDC_LIB_EXT1__=1 -D__STDC_WANT_LIB_EXT1__=1
+
+// CHECK-MESSAGES-WITH-NONE-ENABLED: 1 warning generated
+// CHECK-MESSAGES-WITH-NONE-ENABLED: Suppressed 1 warnings
 
 typedef __SIZE_TYPE__ size_t;
 typedef __WCHAR_TYPE__ wchar_t;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp
index 7bc0925136aa8..2c36349da896c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp
@@ -84,6 +84,31 @@ void test_macro_expansion4() {
 #undef MY_NULL
 }
 
+template <typename T> struct pear {
+  // If you say __null (or NULL), we assume that T will always be a pointer
+  // type, so we suggest replacing it with nullptr. (We only check __null here,
+  // because in this test NULL is defined as 0, but in real library
+  // implementations it is often defined as __null and the check will catch it.)
+  void f() { x = __null; }
+  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: use nullptr [modernize-use-nullptr]
+  // CHECK-FIXES: x = nullptr;
+
+  // But if you say 0, we allow the possibility that T can be used with integral
+  // and pointer types, and "0" is an acceptable initializer (even if "{}" might
+  // be even better).
+  void g() { y = 0; }
+  // CHECK-MESSAGES-NOT: :[[@LINE-1]] warning: use nullptr
+
+  T x;
+  T y;
+};
+void test_templated() {
+  pear<int*> p;
+  p.f();
+  p.g();
+  dummy(p.x);
+}
+
 #define IS_EQ(x, y) if (x != y) return;
 void test_macro_args() {
   int i = 0;
diff --git a/clang-tools-extra/unittests/include/common/VirtualFileHelper.h b/clang-tools-extra/unittests/include/common/VirtualFileHelper.h
index 18b98d2796e67..abe1067495694 100644
--- a/clang-tools-extra/unittests/include/common/VirtualFileHelper.h
+++ b/clang-tools-extra/unittests/include/common/VirtualFileHelper.h
@@ -60,7 +60,7 @@ class VirtualFileHelper {
          I != E; ++I) {
       std::unique_ptr<llvm::MemoryBuffer> Buf = llvm::MemoryBuffer::getMemBuffer(I->Code);
-      const FileEntry *Entry = SM.getFileManager().getVirtualFile(
+      FileEntryRef Entry = SM.getFileManager().getVirtualFileRef(
           I->FileName, Buf->getBufferSize(), /*ModificationTime=*/0);
       SM.overrideFileContents(Entry, std::move(Buf));
     }
diff --git a/clang/CodeOwners.rst b/clang/CodeOwners.rst
index 2ae04c129eb76..f067b7183ae73 100644
--- a/clang/CodeOwners.rst
+++ b/clang/CodeOwners.rst
@@ -120,7 +120,7 @@ OpenBSD driver
 Driver parts not covered by someone else
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 | Fangrui Song
-| maskray\@google.com (email), MaskRay (Phabricator), MaskRay (GitHub)
+| i\@maskray.me (email), MaskRay (Phabricator), MaskRay (GitHub)
 
 
 Tools
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index 4da99e899e7f7..f8a20a1e22472 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -133,7 +133,7 @@ def from_param(cls, param: str | bytes | None) -> c_interop_string:
         )
 
     @staticmethod
-    def to_python_string(x: c_interop_string, *args: Any) -> str | None:
+    def to_python_string(x: c_interop_string) -> str | None:
         return x.value
 
@@ -241,9 +241,9 @@ def __del__(self) -> None:
         conf.lib.clang_disposeString(self)
 
     @staticmethod
-    def from_result(res: _CXString, fn: Any = None, args: Any = None) -> str:
+    def from_result(res: _CXString) -> str:
         assert isinstance(res, _CXString)
-        pystr: str | None = conf.lib.clang_getCString(res)
+        pystr = c_interop_string.to_python_string(conf.lib.clang_getCString(res))
         if pystr is None:
             return ""
         return pystr
 
@@ -424,7 +424,7 @@ def location(self):
 
     @property
     def spelling(self):
-        return conf.lib.clang_getDiagnosticSpelling(self)  # type: ignore [no-any-return]
+        return _CXString.from_result(conf.lib.clang_getDiagnosticSpelling(self))
 
     @property
     def ranges(self) -> NoSliceSequence[SourceRange]:
@@ -453,7 +453,9 @@ def __len__(self) -> int:
         def __getitem__(self, key: int) -> FixIt:
             range = SourceRange()
-            value = conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range))
+            value = _CXString.from_result(
+                conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range))
+            )
 
             if len(value) == 0:
                 raise IndexError
 
@@ -486,12 +488,12 @@ def category_number(self):
     @property
     def category_name(self):
         """The string name of the category for this diagnostic."""
-        return conf.lib.clang_getDiagnosticCategoryText(self)  # type: ignore [no-any-return]
+        return _CXString.from_result(conf.lib.clang_getDiagnosticCategoryText(self))
 
     @property
     def option(self):
         """The command-line option that 
enables this diagnostic."""
-        return conf.lib.clang_getDiagnosticOption(self, None)  # type: ignore [no-any-return]
+        return _CXString.from_result(conf.lib.clang_getDiagnosticOption(self, None))
 
     @property
     def disable_option(self):
@@ -511,7 +513,7 @@ def format(self, options=None):
             options = conf.lib.clang_defaultDiagnosticDisplayOptions()
         if options & ~Diagnostic._FormatOptionsMask:
             raise ValueError("Invalid format options")
-        return conf.lib.clang_formatDiagnostic(self, options)  # type: ignore [no-any-return]
+        return _CXString.from_result(conf.lib.clang_formatDiagnostic(self, options))
 
     def __repr__(self):
         return "<Diagnostic severity %r, location %r, spelling %r>" % (
@@ -1734,7 +1736,7 @@ def get_definition(self):
         """
         # TODO: Should probably check that this is either a reference or
         # declaration prior to issuing the lookup.
-        return conf.lib.clang_getCursorDefinition(self)  # type: ignore [no-any-return]
+        return Cursor.from_result(conf.lib.clang_getCursorDefinition(self), self)
 
     def get_usr(self):
         """Return the Unified Symbol Resolution (USR) for the entity referenced
@@ -1745,13 +1747,13 @@
         program. USRs can be compared across translation units to determine,
         e.g., when references in one translation refer to an entity defined in
        another translation unit."""
-        return conf.lib.clang_getCursorUSR(self)  # type: ignore [no-any-return]
+        return _CXString.from_result(conf.lib.clang_getCursorUSR(self))
 
     def get_included_file(self):
         """Returns the File that is included by the current inclusion cursor."""
         assert self.kind == CursorKind.INCLUSION_DIRECTIVE
 
-        return conf.lib.clang_getIncludedFile(self)  # type: ignore [no-any-return]
+        return File.from_result(conf.lib.clang_getIncludedFile(self), self)
 
     @property
     def kind(self):
@@ -1762,7 +1764,9 @@
     def spelling(self):
         """Return the spelling of the entity pointed at by the cursor."""
         if not hasattr(self, "_spelling"):
-            self._spelling = conf.lib.clang_getCursorSpelling(self)
+            self._spelling = _CXString.from_result(
+                conf.lib.clang_getCursorSpelling(self)
+            )
 
         return self._spelling
 
@@ -1776,7 +1780,9 @@ def displayname(self):
         arguments of a class template specialization.
         """
         if not hasattr(self, "_displayname"):
-            self._displayname = conf.lib.clang_getCursorDisplayName(self)
+            self._displayname = _CXString.from_result(
+                conf.lib.clang_getCursorDisplayName(self)
+            )
 
         return self._displayname
 
@@ -1784,7 +1790,9 @@ def mangled_name(self):
         """Return the mangled name for the entity referenced by this cursor."""
         if not hasattr(self, "_mangled_name"):
-            self._mangled_name = conf.lib.clang_Cursor_getMangling(self)
+            self._mangled_name = _CXString.from_result(
+                conf.lib.clang_Cursor_getMangling(self)
+            )
 
         return self._mangled_name
 
@@ -1876,7 +1884,7 @@ def type(self):
         Retrieve the Type (if any) of the entity pointed at by the cursor.
         """
         if not hasattr(self, "_type"):
-            self._type = conf.lib.clang_getCursorType(self)
+            self._type = Type.from_result(conf.lib.clang_getCursorType(self), (self,))
 
         return self._type
 
@@ -1890,7 +1898,9 @@ def canonical(self):
         declarations will be identical. 
""" if not hasattr(self, "_canonical"): - self._canonical = conf.lib.clang_getCanonicalCursor(self) + self._canonical = Cursor.from_cursor_result( + conf.lib.clang_getCanonicalCursor(self), self + ) return self._canonical @@ -1898,7 +1908,9 @@ def canonical(self): def result_type(self): """Retrieve the Type of the result for this Cursor.""" if not hasattr(self, "_result_type"): - self._result_type = conf.lib.clang_getCursorResultType(self) + self._result_type = Type.from_result( + conf.lib.clang_getCursorResultType(self), (self,) + ) return self._result_type @@ -1925,7 +1937,9 @@ def underlying_typedef_type(self): """ if not hasattr(self, "_underlying_type"): assert self.kind.is_declaration() - self._underlying_type = conf.lib.clang_getTypedefDeclUnderlyingType(self) + self._underlying_type = Type.from_result( + conf.lib.clang_getTypedefDeclUnderlyingType(self), (self,) + ) return self._underlying_type @@ -1938,7 +1952,9 @@ def enum_type(self): """ if not hasattr(self, "_enum_type"): assert self.kind == CursorKind.ENUM_DECL - self._enum_type = conf.lib.clang_getEnumDeclIntegerType(self) + self._enum_type = Type.from_result( + conf.lib.clang_getEnumDeclIntegerType(self), (self,) + ) return self._enum_type @@ -1972,7 +1988,9 @@ def enum_value(self): def objc_type_encoding(self): """Return the Objective-C type encoding as a str.""" if not hasattr(self, "_objc_type_encoding"): - self._objc_type_encoding = conf.lib.clang_getDeclObjCTypeEncoding(self) + self._objc_type_encoding = _CXString.from_result( + conf.lib.clang_getDeclObjCTypeEncoding(self) + ) return self._objc_type_encoding @@ -1988,7 +2006,9 @@ def hash(self): def semantic_parent(self): """Return the semantic parent for this cursor.""" if not hasattr(self, "_semantic_parent"): - self._semantic_parent = conf.lib.clang_getCursorSemanticParent(self) + self._semantic_parent = Cursor.from_cursor_result( + conf.lib.clang_getCursorSemanticParent(self), self + ) return self._semantic_parent @@ -1996,7 +2016,9 @@ def semantic_parent(self): def lexical_parent(self): """Return the lexical parent for this cursor.""" if not hasattr(self, "_lexical_parent"): - self._lexical_parent = conf.lib.clang_getCursorLexicalParent(self) + self._lexical_parent = Cursor.from_cursor_result( + conf.lib.clang_getCursorLexicalParent(self), self + ) return self._lexical_parent @@ -2014,25 +2036,27 @@ def referenced(self): representing the entity that it references. 
""" if not hasattr(self, "_referenced"): - self._referenced = conf.lib.clang_getCursorReferenced(self) + self._referenced = Cursor.from_result( + conf.lib.clang_getCursorReferenced(self), self + ) return self._referenced @property def brief_comment(self): """Returns the brief comment text associated with that Cursor""" - return conf.lib.clang_Cursor_getBriefCommentText(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_Cursor_getBriefCommentText(self)) @property def raw_comment(self): """Returns the raw comment text associated with that Cursor""" - return conf.lib.clang_Cursor_getRawCommentText(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_Cursor_getRawCommentText(self)) def get_arguments(self): """Return an iterator for accessing the arguments of this cursor.""" num_args = conf.lib.clang_Cursor_getNumArguments(self) for i in range(0, num_args): - yield conf.lib.clang_Cursor_getArgument(self, i) + yield Cursor.from_result(conf.lib.clang_Cursor_getArgument(self, i), self) def get_num_template_arguments(self): """Returns the number of template args associated with this cursor.""" @@ -2041,11 +2065,15 @@ def get_num_template_arguments(self): def get_template_argument_kind(self, num): """Returns the TemplateArgumentKind for the indicated template argument.""" - return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) # type: ignore [no-any-return] + return TemplateArgumentKind.from_id( + conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) + ) def get_template_argument_type(self, num): """Returns the CXType for the indicated template argument.""" - return conf.lib.clang_Cursor_getTemplateArgumentType(self, num) # type: ignore [no-any-return] + return Type.from_result( + conf.lib.clang_Cursor_getTemplateArgumentType(self, num), (self, num) + ) def get_template_argument_value(self, num): """Returns the value of the indicated arg as a signed 64b integer.""" @@ -2116,7 +2144,7 @@ def get_bitfield_width(self): return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] @staticmethod - def from_result(res, fn, args): + def from_result(res, arg): assert isinstance(res, Cursor) # FIXME: There should just be an isNull method. if res == conf.lib.clang_getNullCursor(): @@ -2125,14 +2153,10 @@ def from_result(res, fn, args): # Store a reference to the TU in the Python object so it won't get GC'd # before the Cursor. 
tu = None - for arg in args: - if isinstance(arg, TranslationUnit): - tu = arg - break - - if hasattr(arg, "translation_unit"): - tu = arg.translation_unit - break + if isinstance(arg, TranslationUnit): + tu = arg + elif hasattr(arg, "translation_unit"): + tu = arg.translation_unit assert tu is not None @@ -2140,12 +2164,12 @@ def from_result(res, fn, args): return res @staticmethod - def from_cursor_result(res, fn, args): + def from_cursor_result(res, arg): assert isinstance(res, Cursor) if res == conf.lib.clang_getNullCursor(): return None - res._tu = args[0]._tu + res._tu = arg._tu return res @@ -2250,7 +2274,7 @@ class TypeKind(BaseEnumeration): @property def spelling(self): """Retrieve the spelling of this TypeKind.""" - return conf.lib.clang_getTypeKindSpelling(self.value) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypeKindSpelling(self.value)) INVALID = 0 UNEXPOSED = 1 @@ -2438,7 +2462,9 @@ def __getitem__(self, key: int) -> Type: "%d > %d" % (key, len(self)) ) - result: Type = conf.lib.clang_getArgType(self.parent, key) + result = Type.from_result( + conf.lib.clang_getArgType(self.parent, key), (self.parent, key) + ) if result.kind == TypeKind.INVALID: raise IndexError("Argument could not be retrieved.") @@ -2454,7 +2480,7 @@ def element_type(self): If accessed on a type that is not an array, complex, or vector type, an exception will be raised. """ - result = conf.lib.clang_getElementType(self) + result = Type.from_result(conf.lib.clang_getElementType(self), (self,)) if result.kind == TypeKind.INVALID: raise Exception("Element type not available on this type.") @@ -2482,7 +2508,7 @@ def translation_unit(self): return self._tu @staticmethod - def from_result(res, fn, args): + def from_result(res, args): assert isinstance(res, Type) tu = None @@ -2500,7 +2526,9 @@ def get_num_template_arguments(self): return conf.lib.clang_Type_getNumTemplateArguments(self) # type: ignore [no-any-return] def get_template_argument_type(self, num): - return conf.lib.clang_Type_getTemplateArgumentAsType(self, num) # type: ignore [no-any-return] + return Type.from_result( + conf.lib.clang_Type_getTemplateArgumentAsType(self, num), (self, num) + ) def get_canonical(self): """ @@ -2512,7 +2540,7 @@ def get_canonical(self): example, if 'T' is a typedef for 'int', the canonical type for 'T' would be 'int'. """ - return conf.lib.clang_getCanonicalType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getCanonicalType(self), (self,)) def is_const_qualified(self): """Determine whether a Type has the "const" qualifier set. @@ -2548,7 +2576,7 @@ def get_address_space(self): return conf.lib.clang_getAddressSpace(self) # type: ignore [no-any-return] def get_typedef_name(self): - return conf.lib.clang_getTypedefName(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypedefName(self)) def is_pod(self): """Determine whether this Type represents plain old data (POD).""" @@ -2558,25 +2586,25 @@ def get_pointee(self): """ For pointer types, returns the type of the pointee. """ - return conf.lib.clang_getPointeeType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getPointeeType(self), (self,)) def get_declaration(self): """ Return the cursor for the declaration of the given type. 
""" - return conf.lib.clang_getTypeDeclaration(self) # type: ignore [no-any-return] + return Cursor.from_result(conf.lib.clang_getTypeDeclaration(self), self) def get_result(self): """ Retrieve the result type associated with a function type. """ - return conf.lib.clang_getResultType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getResultType(self), (self,)) def get_array_element_type(self): """ Retrieve the type of the elements of the array type. """ - return conf.lib.clang_getArrayElementType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_getArrayElementType(self), (self,)) def get_array_size(self): """ @@ -2588,13 +2616,13 @@ def get_class_type(self): """ Retrieve the class type of the member pointer type. """ - return conf.lib.clang_Type_getClassType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_Type_getClassType(self), (self,)) def get_named_type(self): """ Retrieve the type named by the qualified-id. """ - return conf.lib.clang_Type_getNamedType(self) # type: ignore [no-any-return] + return Type.from_result(conf.lib.clang_Type_getNamedType(self), (self,)) def get_align(self): """ @@ -2647,7 +2675,7 @@ def get_exception_specification_kind(self): @property def spelling(self): """Retrieve the spelling of this Type.""" - return conf.lib.clang_getTypeSpelling(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTypeSpelling(self)) def __eq__(self, other): if type(other) != type(self): @@ -2737,7 +2765,9 @@ def __repr__(self): def spelling(self): if self.__kindNumber in SpellingCache: return SpellingCache[self.__kindNumber] - return conf.lib.clang_getCompletionChunkText(self.cs, self.key) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_getCompletionChunkText(self.cs, self.key) + ) # We do not use @CachedProperty here, as the manual implementation is # apparently still significantly faster. Please profile carefully if you @@ -2839,7 +2869,9 @@ def availability(self): @property def briefComment(self): if conf.function_exists("clang_getCompletionBriefComment"): - return conf.lib.clang_getCompletionBriefComment(self.obj) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_getCompletionBriefComment(self.obj) + ) return _CXString() def __repr__(self): @@ -3125,12 +3157,12 @@ def __del__(self): @property def cursor(self): """Retrieve the cursor that represents the given translation unit.""" - return conf.lib.clang_getTranslationUnitCursor(self) # type: ignore [no-any-return] + return Cursor.from_result(conf.lib.clang_getTranslationUnitCursor(self), self) @property def spelling(self): """Get the original translation unit source file name.""" - return conf.lib.clang_getTranslationUnitSpelling(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTranslationUnitSpelling(self)) def get_includes(self): """ @@ -3356,7 +3388,7 @@ def from_name(translation_unit, file_name): @property def name(self): """Return the complete file and path name of the file.""" - return conf.lib.clang_getFileName(self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getFileName(self)) @property def time(self): @@ -3370,12 +3402,12 @@ def __repr__(self): return "" % (self.name) @staticmethod - def from_result(res, fn, args): + def from_result(res, arg): assert isinstance(res, c_object_p) res = File(res) # Copy a reference to the TranslationUnit to prevent premature GC. 
- res._tu = args[0]._tu + res._tu = arg._tu return res @@ -3440,12 +3472,16 @@ def __init__(self, cmd, ccmds): @property def directory(self): """Get the working directory for this CompileCommand""" - return conf.lib.clang_CompileCommand_getDirectory(self.cmd) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_CompileCommand_getDirectory(self.cmd) + ) @property def filename(self): """Get the working filename for this CompileCommand""" - return conf.lib.clang_CompileCommand_getFilename(self.cmd) # type: ignore [no-any-return] + return _CXString.from_result( + conf.lib.clang_CompileCommand_getFilename(self.cmd) + ) @property def arguments(self): @@ -3457,7 +3493,9 @@ def arguments(self): """ length = conf.lib.clang_CompileCommand_getNumArgs(self.cmd) for i in range(length): - yield conf.lib.clang_CompileCommand_getArg(self.cmd, i) + yield _CXString.from_result( + conf.lib.clang_CompileCommand_getArg(self.cmd, i) + ) class CompileCommands: @@ -3482,7 +3520,7 @@ def __getitem__(self, i): return CompileCommand(cc, self) @staticmethod - def from_result(res, fn, args): + def from_result(res): if not res: return None return CompileCommands(res) @@ -3500,7 +3538,7 @@ def __del__(self): conf.lib.clang_CompilationDatabase_dispose(self) @staticmethod - def from_result(res, fn, args): + def from_result(res): if not res: raise CompilationDatabaseError(0, "CompilationDatabase loading failed") return CompilationDatabase(res) @@ -3510,8 +3548,10 @@ def fromDirectory(buildDir): """Builds a CompilationDatabase from the database found in buildDir""" errorCode = c_uint() try: - cdb = conf.lib.clang_CompilationDatabase_fromDirectory( - os.fspath(buildDir), byref(errorCode) + cdb = CompilationDatabase.from_result( + conf.lib.clang_CompilationDatabase_fromDirectory( + os.fspath(buildDir), byref(errorCode) + ) ) except CompilationDatabaseError as e: raise CompilationDatabaseError( @@ -3524,8 +3564,10 @@ def getCompileCommands(self, filename): Get an iterable object providing all the CompileCommands available to build filename. Returns None if filename is not found in the database. """ - return conf.lib.clang_CompilationDatabase_getCompileCommands( # type: ignore [no-any-return] - self, os.fspath(filename) + return CompileCommands.from_result( + conf.lib.clang_CompilationDatabase_getCompileCommands( # type: ignore [no-any-return] + self, os.fspath(filename) + ) ) def getAllCompileCommands(self): @@ -3533,7 +3575,9 @@ def getAllCompileCommands(self): Get an iterable object providing all the CompileCommands available from the database. """ - return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self) # type: ignore [no-any-return] + return CompileCommands.from_result( + conf.lib.clang_CompilationDatabase_getAllCompileCommands(self) # type: ignore [no-any-return] + ) class Token(Structure): @@ -3554,7 +3598,7 @@ def spelling(self): This is the textual representation of the token in source. 
""" - return conf.lib.clang_getTokenSpelling(self._tu, self) # type: ignore [no-any-return] + return _CXString.from_result(conf.lib.clang_getTokenSpelling(self._tu, self)) @property def kind(self): @@ -3661,41 +3705,19 @@ def write_main_file_to_stdout(self): "clang_CompilationDatabase_fromDirectory", [c_interop_string, POINTER(c_uint)], c_object_p, - CompilationDatabase.from_result, - ), - ( - "clang_CompilationDatabase_getAllCompileCommands", - [c_object_p], - c_object_p, - CompileCommands.from_result, ), + ("clang_CompilationDatabase_getAllCompileCommands", [c_object_p], c_object_p), ( "clang_CompilationDatabase_getCompileCommands", [c_object_p, c_interop_string], c_object_p, - CompileCommands.from_result, ), ("clang_CompileCommands_dispose", [c_object_p]), ("clang_CompileCommands_getCommand", [c_object_p, c_uint], c_object_p), ("clang_CompileCommands_getSize", [c_object_p], c_uint), - ( - "clang_CompileCommand_getArg", - [c_object_p, c_uint], - _CXString, - _CXString.from_result, - ), - ( - "clang_CompileCommand_getDirectory", - [c_object_p], - _CXString, - _CXString.from_result, - ), - ( - "clang_CompileCommand_getFilename", - [c_object_p], - _CXString, - _CXString.from_result, - ), + ("clang_CompileCommand_getArg", [c_object_p, c_uint], _CXString), + ("clang_CompileCommand_getDirectory", [c_object_p], _CXString), + ("clang_CompileCommand_getFilename", [c_object_p], _CXString), ("clang_CompileCommand_getNumArgs", [c_object_p], c_uint), ( "clang_codeCompleteAt", @@ -3743,82 +3765,62 @@ def write_main_file_to_stdout(self): ("clang_equalLocations", [SourceLocation, SourceLocation], bool), ("clang_equalRanges", [SourceRange, SourceRange], bool), ("clang_equalTypes", [Type, Type], bool), - ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString, _CXString.from_result), - ("clang_getArgType", [Type, c_uint], Type, Type.from_result), - ("clang_getArrayElementType", [Type], Type, Type.from_result), + ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString), + ("clang_getArgType", [Type, c_uint], Type), + ("clang_getArrayElementType", [Type], Type), ("clang_getArraySize", [Type], c_longlong), ("clang_getFieldDeclBitWidth", [Cursor], c_int), - ("clang_getCanonicalCursor", [Cursor], Cursor, Cursor.from_cursor_result), - ("clang_getCanonicalType", [Type], Type, Type.from_result), + ("clang_getCanonicalCursor", [Cursor], Cursor), + ("clang_getCanonicalType", [Type], Type), ("clang_getChildDiagnostics", [Diagnostic], c_object_p), ("clang_getCompletionAvailability", [c_void_p], c_int), - ("clang_getCompletionBriefComment", [c_void_p], _CXString, _CXString.from_result), + ("clang_getCompletionBriefComment", [c_void_p], _CXString), ("clang_getCompletionChunkCompletionString", [c_void_p, c_int], c_object_p), ("clang_getCompletionChunkKind", [c_void_p, c_int], c_int), - ( - "clang_getCompletionChunkText", - [c_void_p, c_int], - _CXString, - _CXString.from_result, - ), + ("clang_getCompletionChunkText", [c_void_p, c_int], _CXString), ("clang_getCompletionPriority", [c_void_p], c_int), - ( - "clang_getCString", - [_CXString], - c_interop_string, - c_interop_string.to_python_string, - ), + ("clang_getCString", [_CXString], c_interop_string), ("clang_getCursor", [TranslationUnit, SourceLocation], Cursor), ("clang_getCursorAvailability", [Cursor], c_int), - ("clang_getCursorDefinition", [Cursor], Cursor, Cursor.from_result), - ("clang_getCursorDisplayName", [Cursor], _CXString, _CXString.from_result), + ("clang_getCursorDefinition", [Cursor], Cursor), + ("clang_getCursorDisplayName", [Cursor], 
_CXString), ("clang_getCursorExtent", [Cursor], SourceRange), - ("clang_getCursorLexicalParent", [Cursor], Cursor, Cursor.from_cursor_result), + ("clang_getCursorLexicalParent", [Cursor], Cursor), ("clang_getCursorLocation", [Cursor], SourceLocation), - ("clang_getCursorReferenced", [Cursor], Cursor, Cursor.from_result), + ("clang_getCursorReferenced", [Cursor], Cursor), ("clang_getCursorReferenceNameRange", [Cursor, c_uint, c_uint], SourceRange), - ("clang_getCursorResultType", [Cursor], Type, Type.from_result), - ("clang_getCursorSemanticParent", [Cursor], Cursor, Cursor.from_cursor_result), - ("clang_getCursorSpelling", [Cursor], _CXString, _CXString.from_result), - ("clang_getCursorType", [Cursor], Type, Type.from_result), - ("clang_getCursorUSR", [Cursor], _CXString, _CXString.from_result), - ("clang_Cursor_getMangling", [Cursor], _CXString, _CXString.from_result), + ("clang_getCursorResultType", [Cursor], Type), + ("clang_getCursorSemanticParent", [Cursor], Cursor), + ("clang_getCursorSpelling", [Cursor], _CXString), + ("clang_getCursorType", [Cursor], Type), + ("clang_getCursorUSR", [Cursor], _CXString), + ("clang_Cursor_getMangling", [Cursor], _CXString), # ("clang_getCXTUResourceUsage", # [TranslationUnit], # CXTUResourceUsage), ("clang_getCXXAccessSpecifier", [Cursor], c_uint), - ("clang_getDeclObjCTypeEncoding", [Cursor], _CXString, _CXString.from_result), + ("clang_getDeclObjCTypeEncoding", [Cursor], _CXString), ("clang_getDiagnostic", [c_object_p, c_uint], c_object_p), ("clang_getDiagnosticCategory", [Diagnostic], c_uint), - ("clang_getDiagnosticCategoryText", [Diagnostic], _CXString, _CXString.from_result), - ( - "clang_getDiagnosticFixIt", - [Diagnostic, c_uint, POINTER(SourceRange)], - _CXString, - _CXString.from_result, - ), + ("clang_getDiagnosticCategoryText", [Diagnostic], _CXString), + ("clang_getDiagnosticFixIt", [Diagnostic, c_uint, POINTER(SourceRange)], _CXString), ("clang_getDiagnosticInSet", [c_object_p, c_uint], c_object_p), ("clang_getDiagnosticLocation", [Diagnostic], SourceLocation), ("clang_getDiagnosticNumFixIts", [Diagnostic], c_uint), ("clang_getDiagnosticNumRanges", [Diagnostic], c_uint), - ( - "clang_getDiagnosticOption", - [Diagnostic, POINTER(_CXString)], - _CXString, - _CXString.from_result, - ), + ("clang_getDiagnosticOption", [Diagnostic, POINTER(_CXString)], _CXString), ("clang_getDiagnosticRange", [Diagnostic, c_uint], SourceRange), ("clang_getDiagnosticSeverity", [Diagnostic], c_int), - ("clang_getDiagnosticSpelling", [Diagnostic], _CXString, _CXString.from_result), - ("clang_getElementType", [Type], Type, Type.from_result), + ("clang_getDiagnosticSpelling", [Diagnostic], _CXString), + ("clang_getElementType", [Type], Type), ("clang_getEnumConstantDeclUnsignedValue", [Cursor], c_ulonglong), ("clang_getEnumConstantDeclValue", [Cursor], c_longlong), - ("clang_getEnumDeclIntegerType", [Cursor], Type, Type.from_result), + ("clang_getEnumDeclIntegerType", [Cursor], Type), ("clang_getFile", [TranslationUnit, c_interop_string], c_object_p), - ("clang_getFileName", [File], _CXString, _CXString.from_result), + ("clang_getFileName", [File], _CXString), ("clang_getFileTime", [File], c_uint), - ("clang_getIBOutletCollectionType", [Cursor], Type, Type.from_result), - ("clang_getIncludedFile", [Cursor], c_object_p, File.from_result), + ("clang_getIBOutletCollectionType", [Cursor], Type), + ("clang_getIncludedFile", [Cursor], c_object_p), ( "clang_getInclusions", [TranslationUnit, translation_unit_includes_callback, py_object], @@ -3842,41 +3844,26 @@ def 
write_main_file_to_stdout(self): ("clang_getNumDiagnosticsInSet", [c_object_p], c_uint), ("clang_getNumElements", [Type], c_longlong), ("clang_getNumOverloadedDecls", [Cursor], c_uint), - ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor, Cursor.from_cursor_result), - ("clang_getPointeeType", [Type], Type, Type.from_result), + ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor), + ("clang_getPointeeType", [Type], Type), ("clang_getRange", [SourceLocation, SourceLocation], SourceRange), ("clang_getRangeEnd", [SourceRange], SourceLocation), ("clang_getRangeStart", [SourceRange], SourceLocation), - ("clang_getResultType", [Type], Type, Type.from_result), - ("clang_getSpecializedCursorTemplate", [Cursor], Cursor, Cursor.from_cursor_result), + ("clang_getResultType", [Type], Type), + ("clang_getSpecializedCursorTemplate", [Cursor], Cursor), ("clang_getTemplateCursorKind", [Cursor], c_uint), ("clang_getTokenExtent", [TranslationUnit, Token], SourceRange), ("clang_getTokenKind", [Token], c_uint), ("clang_getTokenLocation", [TranslationUnit, Token], SourceLocation), - ( - "clang_getTokenSpelling", - [TranslationUnit, Token], - _CXString, - _CXString.from_result, - ), - ("clang_getTranslationUnitCursor", [TranslationUnit], Cursor, Cursor.from_result), - ( - "clang_getTranslationUnitSpelling", - [TranslationUnit], - _CXString, - _CXString.from_result, - ), - ( - "clang_getTUResourceUsageName", - [c_uint], - c_interop_string, - c_interop_string.to_python_string, - ), - ("clang_getTypeDeclaration", [Type], Cursor, Cursor.from_result), - ("clang_getTypedefDeclUnderlyingType", [Cursor], Type, Type.from_result), - ("clang_getTypedefName", [Type], _CXString, _CXString.from_result), - ("clang_getTypeKindSpelling", [c_uint], _CXString, _CXString.from_result), - ("clang_getTypeSpelling", [Type], _CXString, _CXString.from_result), + ("clang_getTokenSpelling", [TranslationUnit, Token], _CXString), + ("clang_getTranslationUnitCursor", [TranslationUnit], Cursor), + ("clang_getTranslationUnitSpelling", [TranslationUnit], _CXString), + ("clang_getTUResourceUsageName", [c_uint], c_interop_string), + ("clang_getTypeDeclaration", [Type], Cursor), + ("clang_getTypedefDeclUnderlyingType", [Cursor], Type), + ("clang_getTypedefName", [Type], _CXString), + ("clang_getTypeKindSpelling", [c_uint], _CXString), + ("clang_getTypeSpelling", [Type], _CXString), ("clang_hashCursor", [Cursor], c_uint), ("clang_isAttribute", [CursorKind], bool), ("clang_isConstQualifiedType", [Type], bool), @@ -3909,31 +3896,27 @@ def write_main_file_to_stdout(self): ), ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint), ("clang_Cursor_getNumArguments", [Cursor], c_int), - ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor, Cursor.from_result), + ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor), ("clang_Cursor_getNumTemplateArguments", [Cursor], c_int), - ( - "clang_Cursor_getTemplateArgumentKind", - [Cursor, c_uint], - TemplateArgumentKind.from_id, - ), - ("clang_Cursor_getTemplateArgumentType", [Cursor, c_uint], Type, Type.from_result), + ("clang_Cursor_getTemplateArgumentKind", [Cursor, c_uint], c_uint), + ("clang_Cursor_getTemplateArgumentType", [Cursor, c_uint], Type), ("clang_Cursor_getTemplateArgumentValue", [Cursor, c_uint], c_longlong), ("clang_Cursor_getTemplateArgumentUnsignedValue", [Cursor, c_uint], c_ulonglong), ("clang_Cursor_isAnonymous", [Cursor], bool), ("clang_Cursor_isBitField", [Cursor], bool), ("clang_Cursor_getBinaryOpcode", [Cursor], c_int), - ("clang_Cursor_getBriefCommentText", 
[Cursor], _CXString, _CXString.from_result),
- ("clang_Cursor_getRawCommentText", [Cursor], _CXString, _CXString.from_result),
+ ("clang_Cursor_getBriefCommentText", [Cursor], _CXString),
+ ("clang_Cursor_getRawCommentText", [Cursor], _CXString),
("clang_Cursor_getOffsetOfField", [Cursor], c_longlong), ("clang_Location_isInSystemHeader", [SourceLocation], bool), ("clang_Type_getAlignOf", [Type], c_longlong),
- ("clang_Type_getClassType", [Type], Type, Type.from_result),
+ ("clang_Type_getClassType", [Type], Type),
("clang_Type_getNumTemplateArguments", [Type], c_int),
- ("clang_Type_getTemplateArgumentAsType", [Type, c_uint], Type, Type.from_result),
+ ("clang_Type_getTemplateArgumentAsType", [Type, c_uint], Type),
("clang_Type_getOffsetOf", [Type, c_interop_string], c_longlong), ("clang_Type_getSizeOf", [Type], c_longlong), ("clang_Type_getCXXRefQualifier", [Type], c_uint),
- ("clang_Type_getNamedType", [Type], Type, Type.from_result),
+ ("clang_Type_getNamedType", [Type], Type),
("clang_Type_visitFields", [Type, fields_visit_callback, py_object], c_uint), ]
diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index 7476947bde2ea..77d8ca415708f 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -14,6 +14,7 @@ from clang.cindex import TranslationUnit from clang.cindex import TypeKind from clang.cindex import BinaryOperator
+from clang.cindex import StorageClass
from .util import get_cursor from .util import get_cursors from .util import get_tu @@ -279,6 +280,90 @@ def test_is_default_method(self): self.assertTrue(xc.is_default_method()) self.assertFalse(yc.is_default_method())
+
+ def test_is_deleted_method(self):
+ source = "class X { X() = delete; }; class Y { Y(); };"
+ tu = get_tu(source, lang="cpp")
+
+ xs = get_cursors(tu, "X")
+ ys = get_cursors(tu, "Y")
+
+ self.assertEqual(len(xs), 2)
+ self.assertEqual(len(ys), 2)
+
+ xc = xs[1]
+ yc = ys[1]
+
+ self.assertTrue(xc.is_deleted_method())
+ self.assertFalse(yc.is_deleted_method())
+
+ def test_is_copy_assignment_operator_method(self):
+ source_with_copy_assignment_operators = """
+ struct Foo {
+ // Those are copy-assignment operators
+ bool operator=(const Foo&);
+ bool operator=(Foo&);
+ Foo operator=(Foo);
+ bool operator=(volatile Foo&);
+ bool operator=(const volatile Foo&);
+
+ // Positive-check that the recognition works for templated classes too
+ template <typename T>
+ class Bar {
+ bool operator=(const Bar&);
+ Bar operator=(const Bar);
+ bool operator=(Bar&);
+ bool operator=(volatile Bar&);
+ bool operator=(const volatile Bar&);
+ };
+ };
+ """
+ source_without_copy_assignment_operators = """
+ struct Foo {
+ // Those are not copy-assignment operators
+ template <typename T>
+ bool operator=(const T&);
+ bool operator=(const bool&);
+ bool operator=(char&);
+ bool operator=(volatile unsigned int&);
+ bool operator=(const volatile unsigned char&);
+ bool operator=(int);
+ bool operator=(Foo&&);
+ };
+ """
+ tu_with_copy_assignment_operators = get_tu(
+ source_with_copy_assignment_operators, lang="cpp"
+ )
+ tu_without_copy_assignment_operators = get_tu(
+ source_without_copy_assignment_operators, lang="cpp"
+ )
+
+ copy_assignment_operators_cursors = get_cursors(
+ tu_with_copy_assignment_operators, "operator="
+ )
+ non_copy_assignment_operators_cursors = get_cursors(
+ tu_without_copy_assignment_operators, "operator="
+ )
+
+ self.assertEqual(len(copy_assignment_operators_cursors), 10)
+
self.assertEqual(len(non_copy_assignment_operators_cursors), 7)
+
+ self.assertTrue(
+ all(
+ [
+ cursor.is_copy_assignment_operator_method()
+ for cursor in copy_assignment_operators_cursors
+ ]
+ )
+ )
+
+ self.assertFalse(
+ any(
+ [
+ cursor.is_copy_assignment_operator_method()
+ for cursor in non_copy_assignment_operators_cursors
+ ]
+ )
+ )
+
 def test_is_move_assignment_operator_method(self): """Ensure Cursor.is_move_assignment_operator_method works.""" source_with_move_assignment_operators = """ @@ -482,6 +567,41 @@ def test_is_scoped_enum(self): self.assertFalse(regular_enum.is_scoped_enum()) self.assertTrue(scoped_enum.is_scoped_enum())
+ def test_get_definition(self):
+ """Ensure Cursor.get_definition works."""
+ tu = get_tu(
+ """
+class A {
+ constexpr static int f(){return 3;}
+};
+struct B {
+ int b = A::f();
+};
+""",
+ lang="cpp",
+ )
+ curs = get_cursors(tu, "f")
+ self.assertEqual(len(curs), 4)
+ self.assertEqual(curs[0].kind, CursorKind.CXX_METHOD)
+ self.assertEqual(curs[1].get_definition(), curs[0])
+ self.assertEqual(curs[2].get_definition(), curs[0])
+ self.assertEqual(curs[3].get_definition(), curs[0])
+
+ def test_get_usr(self):
+ """Ensure Cursor.get_usr works."""
+ tu = get_tu(
+ """
+int add(int, int);
+int add(int a, int b) { return a + b; }
+int add(float a, float b) { return a + b; }
+""",
+ lang="cpp",
+ )
+ curs = get_cursors(tu, "add")
+ self.assertEqual(len(curs), 3)
+ self.assertEqual(curs[0].get_usr(), curs[1].get_usr())
+ self.assertNotEqual(curs[0].get_usr(), curs[2].get_usr())
+
 def test_underlying_type(self): tu = get_tu("typedef int foo;") typedef = get_cursor(tu, "foo") @@ -570,6 +690,23 @@ def test_enum_values_cpp(self): self.assertEqual(ham.kind, CursorKind.ENUM_CONSTANT_DECL) self.assertEqual(ham.enum_value, 0x10000000000)
+ def test_enum_values_unsigned(self):
+ tu = get_tu("enum TEST : unsigned char { SPAM=0, HAM = 200};", lang="cpp")
+ enum = get_cursor(tu, "TEST")
+ self.assertIsNotNone(enum)
+
+ self.assertEqual(enum.kind, CursorKind.ENUM_DECL)
+
+ enum_constants = list(enum.get_children())
+ self.assertEqual(len(enum_constants), 2)
+
+ spam, ham = enum_constants
+
+ self.assertEqual(spam.kind, CursorKind.ENUM_CONSTANT_DECL)
+ self.assertEqual(spam.enum_value, 0)
+ self.assertEqual(ham.kind, CursorKind.ENUM_CONSTANT_DECL)
+ self.assertEqual(ham.enum_value, 200)
+
 def test_annotation_attribute(self): tu = get_tu( 'int foo (void) __attribute__ ((annotate("here be annotation attribute")));' @@ -625,6 +762,25 @@ def test_result_type_objc_method_decl(self): self.assertEqual(cursor.kind, CursorKind.OBJC_INSTANCE_METHOD_DECL) self.assertEqual(result_type.kind, TypeKind.VOID)
+ def test_storage_class(self):
+ tu = get_tu(
+ """
+extern int ex;
+register int reg;
+int count(int a, int b){
+ static int counter = 0;
+ return 0;
+}
+""",
+ lang="cpp",
+ )
+ cursor = get_cursor(tu, "ex")
+ self.assertEqual(cursor.storage_class, StorageClass.EXTERN)
+ cursor = get_cursor(tu, "counter")
+ self.assertEqual(cursor.storage_class, StorageClass.STATIC)
+ cursor = get_cursor(tu, "reg")
+ self.assertEqual(cursor.storage_class, StorageClass.REGISTER)
+
 def test_availability(self): tu = get_tu("class A { A(A const&) = delete; };", lang="cpp") @@ -681,6 +837,23 @@ def test_get_token_cursor(self): r_cursor = t_cursor.referenced # should not raise an exception self.assertEqual(r_cursor.kind, CursorKind.CLASS_DECL)
+ def test_get_field_offsetof(self):
+ tu = get_tu(
+ "struct myStruct {int a; char b; char c; short d; char e;};", lang="cpp"
+ )
+ c1 = get_cursor(tu,
"myStruct") + c2 = get_cursor(tu, "a") + c3 = get_cursor(tu, "b") + c4 = get_cursor(tu, "c") + c5 = get_cursor(tu, "d") + c6 = get_cursor(tu, "e") + self.assertEqual(c1.get_field_offsetof(), -1) + self.assertEqual(c2.get_field_offsetof(), 0) + self.assertEqual(c3.get_field_offsetof(), 32) + self.assertEqual(c4.get_field_offsetof(), 40) + self.assertEqual(c5.get_field_offsetof(), 48) + self.assertEqual(c6.get_field_offsetof(), 64) + def test_get_arguments(self): tu = get_tu("void foo(int i, int j);") foo = get_cursor(tu, "foo") @@ -799,3 +972,13 @@ def test_binop(self): for op, typ in operators.items(): c = get_cursor(tu, op) assert c.binary_operator == typ + + def test_from_result_null(self): + tu = get_tu("int a = 1+2;", lang="cpp") + op = next(next(tu.cursor.get_children()).get_children()) + self.assertEqual(op.kind, CursorKind.BINARY_OPERATOR) + self.assertEqual(op.get_definition(), None) + + def test_from_cursor_result_null(self): + tu = get_tu("") + self.assertEqual(tu.cursor.semantic_parent, None) diff --git a/clang/bindings/python/tests/cindex/test_diagnostics.py b/clang/bindings/python/tests/cindex/test_diagnostics.py index 57c41baaa2541..041083d12c7f1 100644 --- a/clang/bindings/python/tests/cindex/test_diagnostics.py +++ b/clang/bindings/python/tests/cindex/test_diagnostics.py @@ -46,6 +46,8 @@ def test_diagnostic_fixit(self): self.assertEqual(tu.diagnostics[0].location.column, 26) self.assertRegex(tu.diagnostics[0].spelling, "use of GNU old-style.*") self.assertEqual(len(tu.diagnostics[0].fixits), 1) + with self.assertRaises(IndexError): + tu.diagnostics[0].fixits[1] self.assertEqual(tu.diagnostics[0].fixits[0].range.start.line, 1) self.assertEqual(tu.diagnostics[0].fixits[0].range.start.column, 26) self.assertEqual(tu.diagnostics[0].fixits[0].range.end.line, 1) @@ -97,6 +99,8 @@ def test_diagnostic_children(self): children = d.children self.assertEqual(len(children), 1) + with self.assertRaises(IndexError): + children[1] self.assertEqual(children[0].severity, Diagnostic.Note) self.assertRegex(children[0].spelling, ".*declared here") self.assertEqual(children[0].location.line, 1) @@ -111,3 +115,16 @@ def test_diagnostic_string_repr(self): repr(d), ", spelling \"expected ';' after struct\">", ) + + def test_diagnostic_string_format(self): + tu = get_tu("struct MissingSemicolon{}") + self.assertEqual(len(tu.diagnostics), 1) + d = tu.diagnostics[0] + + self.assertEqual(str(d), "t.c:1:26: error: expected ';' after struct") + self.assertEqual( + d.format(0b111111), + "t.c:1:26: error: expected ';' after struct [3, Parse Issue]", + ) + with self.assertRaises(ValueError): + d.format(0b1000000) diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index 1dd8db0e3e814..928a9794e4213 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -10,7 +10,9 @@ from clang.cindex import CursorKind from clang.cindex import TranslationUnit from clang.cindex import TypeKind +from clang.cindex import RefQualifierKind from .util import get_cursor +from .util import get_cursors from .util import get_tu @@ -308,10 +310,10 @@ def test_element_type(self): def test_invalid_element_type(self): """Ensure Type.element_type raises if type doesn't have elements.""" tu = get_tu("int i;") - i = get_cursor(tu, "i") - self.assertIsNotNone(i) - with self.assertRaises(Exception): - i.element_type + ty = get_cursor(tu, "i").type + with self.assertRaises(Exception) as ctx: + ty.element_type + 
self.assertEqual(str(ctx.exception), "Element type not available on this type.")
 def test_element_count(self): """Ensure Type.element_count works.""" @@ -357,6 +359,49 @@ def test_is_restrict_qualified(self): self.assertTrue(i.type.is_restrict_qualified()) self.assertFalse(j.type.is_restrict_qualified())
+ def test_get_result(self):
+ tu = get_tu("void foo(); int bar(char, short);")
+ foo = get_cursor(tu, "foo")
+ bar = get_cursor(tu, "bar")
+ self.assertEqual(foo.type.get_result().spelling, "void")
+ self.assertEqual(bar.type.get_result().spelling, "int")
+
+ def test_get_class_type(self):
+ tu = get_tu(
+ """
+class myClass
+{
+ char *myAttr;
+};
+
+char *myClass::*pMyAttr = &myClass::myAttr;
+""",
+ lang="cpp",
+ )
+ cur = get_cursor(tu, "pMyAttr")
+ self.assertEqual(cur.type.get_class_type().spelling, "myClass")
+
+ def test_get_named_type(self):
+ tu = get_tu("using char_alias = char; char_alias xyz;", lang="cpp")
+ cur = get_cursor(tu, "xyz")
+ self.assertEqual(cur.type.get_named_type().spelling, "char_alias")
+
+ def test_get_ref_qualifier(self):
+ tu = get_tu(
+ """
+class A
+{
+ const int& getAttr() const &;
+ int getAttr() const &&;
+};
+""",
+ lang="cpp",
+ )
+ getters = get_cursors(tu, "getAttr")
+ self.assertEqual(len(getters), 2)
+ self.assertEqual(getters[0].type.get_ref_qualifier(), RefQualifierKind.LVALUE)
+ self.assertEqual(getters[1].type.get_ref_qualifier(), RefQualifierKind.RVALUE)
+
 def test_record_layout(self): """Ensure Cursor.type.get_size, Cursor.type.get_align and Cursor.type.get_offset work."""
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 0c6b9b1b8f9ce..ea4b4bcec55e7 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -700,6 +700,8 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types of a floating-point number T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types
+ T __builtin_elementwise_fmod(T x, T y) return the floating-point remainder of (x/y) whose sign floating point types
+ matches the sign of x.
 T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types T __builtin_elementwise_min(T x, T y) return x or y, whichever is smaller integer and floating point types T __builtin_elementwise_add_sat(T x, T y) return the sum of x and y, clamped to the range of integer types
diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst index 5e281a2a35790..3f96267603aef 100644 --- a/clang/docs/RealtimeSanitizer.rst +++ b/clang/docs/RealtimeSanitizer.rst @@ -84,6 +84,76 @@ non-zero exit code. #14 0x0001958960dc () #15 0x2f557ffffffffffc ()
+Run-time flags
+--------------
+
+RealtimeSanitizer supports a number of run-time flags, which can be specified in the ``RTSAN_OPTIONS`` environment variable:
+
+.. code-block:: console
+
+ % RTSAN_OPTIONS=option_1=true:path_option_2="/some/file.txt" ./a.out
+ ...
+
+Or at compile-time by providing the symbol ``__rtsan_default_options``:
+
+.. code-block:: c
+
+ __attribute__((__visibility__("default")))
+ extern "C" const char *__rtsan_default_options() {
+ return "symbolize=false:abort_on_error=0:log_to_syslog=0";
+ }
+
+You can see all sanitizer options (some of which are unsupported) by using the ``help`` flag:
+
+.. code-block:: console
+
+ % RTSAN_OPTIONS=help=true ./a.out
+
+A **partial** list of flags RealtimeSanitizer respects:
+
+.. list-table:: Run-time Flags
+ :widths: 20 10 10 70
+ :header-rows: 1
+
+ * - Flag name
+ - Default value
+ - Type
+ - Short description
+ * - ``halt_on_error``
+ - ``true``
+ - boolean
+ - Exit after the first reported error. If false, execution continues after a detected error, and duplicate error stacks are deduplicated so each error appears only once.
+ * - ``print_stats_on_exit``
+ - ``false``
+ - boolean
+ - Print stats on exit. Includes total and unique errors.
+ * - ``color``
+ - ``"auto"``
+ - string
+ - Colorize reports: (always|never|auto).
+ * - ``fast_unwind_on_fatal``
+ - ``false``
+ - boolean
+ - If available, use the fast frame-pointer-based unwinder on detected errors. If true, ensure the code under test has been compiled with frame pointers, using ``-fno-omit-frame-pointer`` or similar.
+ * - ``abort_on_error``
+ - OS dependent
+ - boolean
+ - If true, the tool calls abort() instead of _exit() after printing the error report. On some OSes (OSX, for example) this is beneficial because a better stack trace is emitted on crash.
+ * - ``symbolize``
+ - ``true``
+ - boolean
+ - If set, use the symbolizer to turn virtual addresses into file/line locations. Setting this to false can greatly speed up error reporting.
+
+
+Some issues with flags can be debugged using the ``verbosity=$NUM`` flag:
+
+.. code-block:: console
+
+ % RTSAN_OPTIONS=verbosity=1:misspelled_flag=true ./a.out
+ WARNING: found 1 unrecognized flag(s):
+ misspelled_flag
+ ...
+
 Disabling ---------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5923888383022..34d2b584274a5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -81,10 +81,29 @@ C++ Specific Potentially Breaking Changes template void f();
+- During constant evaluation, comparisons between different evaluations of the
+ same string literal are now correctly treated as non-constant, and comparisons
+ between string literals that cannot possibly overlap in memory are now treated
+ as constant. This updates Clang to match the anticipated direction of open core
+ issue `CWG2765 `, but is subject to change once that
+ issue is resolved.
+
+ .. code-block:: c++
+
+ constexpr const char *f() { return "hello"; }
+ constexpr const char *g() { return "world"; }
+ // Used to evaluate to false, now error: non-constant comparison.
+ constexpr bool a = f() == f();
+ // Might evaluate to true or false, as before.
+ bool at_runtime() { return f() == f(); }
+ // Was error, now evaluates to false.
+ constexpr bool b = f() == g();
+
 ABI Changes in This Version --------------------------- - Fixed Microsoft name mangling of placeholder, auto and decltype(auto), return types for MSVC 1920+. This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option -fms-compatibility-version=19.14 to imitate the MSVC 1914 mangling behavior.
+- Fixed the Itanium mangling of the construction vtable name. This change will introduce incompatibilities with code compiled by Clang 19 and earlier versions, unless the -fclang-abi-compat=19 option is used. (#GH108015)
 AST Dumping Potentially Breaking Changes ---------------------------------------- @@ -123,6 +142,8 @@ C++ Language Changes - Add ``__builtin_elementwise_popcount`` builtin for integer types only.
+- Add ``__builtin_elementwise_fmod`` builtin for floating point types only.
+
 - The builtin type alias ``__builtin_common_type`` has been added to improve the performance of ``std::common_type``. @@ -184,6 +205,9 @@ Resolutions to C++ Defect Reports - Reject explicit object parameters with type ``void`` (``this void``). (`CWG2915: Explicit object parameters of type void `_).
+- Clang now allows a trailing requires-clause on explicit deduction guides.
+ (`CWG2707: Deduction guides cannot have a trailing requires-clause `_).
+
 C Language Changes ------------------ @@ -336,6 +360,10 @@ Improvements to Clang's diagnostics local variables passed to function calls using the ``[[clang::musttail]]`` attribute.
+- Clang now diagnoses cases where a dangling ``GSLOwner`` object is constructed, e.g. ``std::vector v = {std::string()};`` (#GH100526).
+
+- Clang now diagnoses when a ``requires`` expression has a local parameter of void type, aligning with the treatment of function parameters (#GH109831).
+
 Improvements to Clang's time-trace ---------------------------------- @@ -424,6 +452,11 @@ Bug Fixes to C++ Support - Fixed an assertion failure in debug mode, and potential crashes in release mode, when diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter. - Fixed an assertion failure by adjusting integral to boolean vector conversions (#GH108326)
+- Mangle friend function templates with a constraint that depends on a template parameter from an enclosing template as members of the enclosing class. (#GH110247)
+- Fixed an issue in constraint evaluation, where type constraints on the lambda expression
+ containing outer unexpanded parameters were not correctly expanded. (#GH101754)
+- Fixed a bug in constraint expression comparison where the ``sizeof...`` expression was not handled properly
+ in certain friend declarations. (#GH93099)
 Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -487,6 +520,14 @@ X86 Support * Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and ``*_(mask(z)))_minmax_s[s|d|h]``.
+- All intrinsics in lzcntintrin.h can now be used in constant expressions.
+
+- All intrinsics in bmiintrin.h can now be used in constant expressions.
+
+- All intrinsics in bmi2intrin.h can now be used in constant expressions.
+
+- All intrinsics in tbmintrin.h can now be used in constant expressions.
+
 Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 57d78f867bab6..4f03388bc87bd 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2369,14 +2369,16 @@ are listed below. $ cd $P/bar && clang -c -funique-internal-linkage-names name_conflict.c $ cd $P && clang foo/name_conflict.o && bar/name_conflict.o
-.. option:: -fbasic-block-sections=[labels, all, list=, none]
+.. option:: -f[no-]basic-block-address-map
+ Emits a ``SHT_LLVM_BB_ADDR_MAP`` section which includes address offsets for each
+ basic block in the program, relative to the parent function address.
+
+
+.. option:: -fbasic-block-sections=[all, list=, none]
 Controls how Clang emits text sections for basic blocks. With values ``all`` and ``list=``, each basic block or a subset of basic blocks can be placed
- in its own unique section. With the "labels" value, normal text sections are
- emitted, but a ``.bb_addr_map`` section is emitted which includes address
- offsets for each basic block in the program, relative to the parent function
- address.
+ in its own unique section.
With the ``list=`` option, a file containing the subset of basic blocks that need to be placed in unique sections can be specified. The format of the
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index c124fefc78611..81264428c72ed 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -1008,6 +1008,241 @@ optin.portability.UnixAPI """"""""""""""""""""""""" Finds implementation-defined behavior in UNIX/Posix functions.
+
+optin.taint
+^^^^^^^^^^^
+
+Checkers implementing
+`taint analysis `_.
+
+.. _optin-taint-GenericTaint:
+
+optin.taint.GenericTaint (C, C++)
+"""""""""""""""""""""""""""""""""
+
+Taint analysis identifies potential security vulnerabilities where the
+attacker can inject malicious data into the program to execute an attack
+(privilege escalation, command injection, SQL injection etc.).
+
+The malicious data is injected at the taint source (e.g. a ``getenv()`` call),
+propagated through function calls, and eventually used as an argument of a
+sensitive operation, also called a taint sink (e.g. a ``system()`` call).
+
+One can defend against this type of vulnerability by always checking and
+sanitizing the potentially malicious, untrusted user input.
+
+The goal of the checker is to discover and show to the user these potential
+taint source-sink pairs and the propagation call chain.
+
+The most notable examples of taint sources are:
+
+ - data from network
+ - files or standard input
+ - environment variables
+ - data from databases
+
+Let us examine a practical example of a Command Injection attack.
+
+.. code-block:: c
+
+ // Command Injection Vulnerability Example
+ int main(int argc, char** argv) {
+ char cmd[2048] = "/bin/cat ";
+ char filename[1024];
+ printf("Filename:");
+ scanf (" %1023[^\n]", filename); // The attacker can inject a shell escape here
+ strcat(cmd, filename);
+ system(cmd); // Warning: Untrusted data is passed to a system call
+ }
+
+The program prints the content of any user-specified file.
+Unfortunately, the attacker can execute arbitrary commands
+with shell escapes. For example, with the following input the `ls` command is also
+executed after the contents of `/etc/shadow` are printed.
+`Input: /etc/shadow ; ls /`
+
+The analysis implemented in this checker points out this problem.
+
+One can protect against such an attack by, for example, checking whether the provided
+input refers to a valid file and removing any invalid user input.
+
+.. code-block:: c
+
+ // No vulnerability anymore, but we still get the warning
+ void sanitizeFileName(char* filename){
+ if (access(filename,F_OK)){// Verifying user input
+ printf("File does not exist\n");
+ filename[0]='\0';
+ }
+ }
+ int main(int argc, char** argv) {
+ char cmd[2048] = "/bin/cat ";
+ char filename[1024];
+ printf("Filename:");
+ scanf (" %1023[^\n]", filename); // The attacker can inject a shell escape here
+ sanitizeFileName(filename);// filename is safe after this point
+ if (!filename[0])
+ return -1;
+ strcat(cmd, filename);
+ system(cmd); // Superfluous Warning: Untrusted data is passed to a system call
+ }
+
+Unfortunately, the checker cannot discover automatically that the programmer
+has performed data sanitization, so it still emits the warning.
+
+One can get rid of this superfluous warning by specifying the
+sanitization functions in the taint configuration file (see
+:doc:`user-docs/TaintAnalysisConfiguration`).
+
+.. code-block:: YAML
+
+ Filters:
+ - Name: sanitizeFileName
+ Args: [0]
+
+The clang invocation to pass the configuration file location:
+
+.. code-block:: bash
+
+ clang --analyze -Xclang -analyzer-config -Xclang optin.taint.TaintPropagation:Config=`pwd`/taint_config.yml ...
+
+If you are validating your inputs instead of sanitizing them, or don't want to
+mention each sanitizing function in the configuration,
+you can use a more generic approach.
+
+Introduce a generic no-op `csa_mark_sanitized(..)` function to
+tell the Clang Static Analyzer
+that the variable is safe to be used on that analysis path.
+
+.. code-block:: c
+
+ // Marking sanitized variables safe.
+ // No vulnerability anymore, no warning.
+
+ // The user-defined csa_mark_sanitized function is for the analyzer only
+ #ifdef __clang_analyzer__
+ void csa_mark_sanitized(const void *);
+ #endif
+
+ int main(int argc, char** argv) {
+ char cmd[2048] = "/bin/cat ";
+ char filename[1024];
+ printf("Filename:");
+ scanf (" %1023[^\n]", filename);
+ if (access(filename,F_OK)){// Verifying user input
+ printf("File does not exist\n");
+ return -1;
+ }
+ #ifdef __clang_analyzer__
+ csa_mark_sanitized(filename); // Indicating to CSA that filename variable is safe to be used after this point
+ #endif
+ strcat(cmd, filename);
+ system(cmd); // No warning
+ }
+
+Similarly to the previous example, you need to
+define a `Filter` function in a `YAML` configuration file
+and add the `csa_mark_sanitized` function.
+
+.. code-block:: YAML
+
+ Filters:
+ - Name: csa_mark_sanitized
+ Args: [0]
+
+Then calling `csa_mark_sanitized(X)` will tell the analyzer that `X` is safe to
+be used after this point, because its contents are verified. It is the
+responsibility of the programmer to ensure that this verification was indeed
+correct. Please note that the `csa_mark_sanitized` function is only declared and
+used during Clang Static Analysis and skipped in (production) builds.
+
+Further examples of injection vulnerabilities this checker can find:
+
+.. code-block:: c
+
+ void test() {
+ char x = getchar(); // 'x' marked as tainted
+ system(&x); // warn: untrusted data is passed to a system call
+ }
+
+ // note: compiler internally checks if the second param to
+ // sprintf is a string literal or not.
+ // Use -Wno-format-security to suppress compiler warning.
+ void test() {
+ char s[10], buf[10];
+ fscanf(stdin, "%s", s); // 's' marked as tainted
+
+ sprintf(buf, s); // warn: untrusted data used as a format string
+ }
+
+There are built-in sources, propagation rules, and sinks even if no external taint
+configuration is provided.
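For illustration, here is a minimal sketch (an invented example, not from the checker's documentation or test suite; the function name `defaults_demo` is hypothetical) in which every step is covered by the built-in rules listed below:

.. code-block:: c

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 void defaults_demo(void) {
 char buf[64];
 scanf("%63s", buf); // 'scanf' is a default source: 'buf' becomes tainted
 char *dup = strndup(buf, 63); // 'strndup' is a default propagation rule: taint flows to 'dup'
 if (dup) {
 system(dup); // 'system' is a default sink: warning here
 free(dup);
 }
 }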
+
+Default sources:
+ ``_IO_getc``, ``fdopen``, ``fopen``, ``freopen``, ``get_current_dir_name``,
+ ``getch``, ``getchar``, ``getchar_unlocked``, ``getwd``, ``getcwd``,
+ ``getgroups``, ``gethostname``, ``getlogin``, ``getlogin_r``, ``getnameinfo``,
+ ``gets``, ``gets_s``, ``getseuserbyname``, ``readlink``, ``readlinkat``,
+ ``scanf``, ``scanf_s``, ``socket``, ``wgetch``
+
+Default propagation rules:
+ ``atoi``, ``atol``, ``atoll``, ``basename``, ``dirname``, ``fgetc``,
+ ``fgetln``, ``fgets``, ``fnmatch``, ``fread``, ``fscanf``, ``fscanf_s``,
+ ``index``, ``inflate``, ``isalnum``, ``isalpha``, ``isascii``, ``isblank``,
+ ``iscntrl``, ``isdigit``, ``isgraph``, ``islower``, ``isprint``, ``ispunct``,
+ ``isspace``, ``isupper``, ``isxdigit``, ``memchr``, ``memrchr``, ``sscanf``,
+ ``getc``, ``getc_unlocked``, ``getdelim``, ``getline``, ``getw``, ``memcmp``,
+ ``memcpy``, ``memmem``, ``memmove``, ``mbtowc``, ``pread``, ``qsort``,
+ ``qsort_r``, ``rawmemchr``, ``read``, ``recv``, ``recvfrom``, ``rindex``,
+ ``strcasestr``, ``strchr``, ``strchrnul``, ``strcasecmp``, ``strcmp``,
+ ``strcspn``, ``strncasecmp``, ``strncmp``, ``strndup``,
+ ``strndupa``, ``strpbrk``, ``strrchr``, ``strsep``, ``strspn``,
+ ``strstr``, ``strtol``, ``strtoll``, ``strtoul``, ``strtoull``, ``tolower``,
+ ``toupper``, ``ttyname``, ``ttyname_r``, ``wctomb``, ``wcwidth``
+
+Default sinks:
+ ``printf``, ``setproctitle``, ``system``, ``popen``, ``execl``, ``execle``,
+ ``execlp``, ``execv``, ``execvp``, ``execvP``, ``execve``, ``dlopen``
+
+Please note that there are no built-in filter functions.
+
+One can configure their own taint sources, sinks, and propagation rules by
+providing a configuration file via checker option
+``optin.taint.TaintPropagation:Config``. The configuration file is in
+`YAML `_ format. The
+taint-related options defined in the config file extend but do not override the
+built-in sources, rules, sinks. The format of the external taint configuration
+file is not stable, and could change without any notice even in a non-backward
+compatible way.
+
+For a more detailed description of configuration options, please see the
+:doc:`user-docs/TaintAnalysisConfiguration`. For an example see
+:ref:`clangsa-taint-configuration-example`.
+
+**Configuration**
+
+* `Config` Specifies the name of the YAML configuration file. The user can
+ define their own taint sources and sinks.
+
+**Related Guidelines**
+
+* `CWE Data Neutralization Issues
+ `_
+* `SEI Cert STR02-C. Sanitize data passed to complex subsystems
+ `_
+* `SEI Cert ENV33-C. Do not call system()
+ `_
+* `ENV03-C. Sanitize the environment when invoking external programs
+ `_
+
+**Limitations**
+
+* The taintedness property is not propagated through function calls which are
+ unknown (or too complex) to the analyzer, unless there is a specific
+ propagation rule built-in to the checker or given in the YAML configuration
+ file. This causes potential true positive findings to be lost.
+
+
 .. _optin-taint-TaintedAlloc: optin.taint.TaintedAlloc (C, C++) @@ -1026,7 +1261,7 @@ covers the SEI Cert coding standard rule `INT04-C You can silence this warning either by bound checking the ``size`` parameter, or by explicitly marking the ``size`` parameter as sanitized. See the
-:ref:`alpha-security-taint-GenericTaint` checker for an example.
+:ref:`optin-taint-GenericTaint` checker for an example.
 .. code-block:: c @@ -1053,6 +1288,34 @@ by explicitly marking the ``size`` parameter as sanitized. See the delete[] ptr; }
+.. _optin-taint-TaintedDiv:
+
+optin.taint.TaintedDiv (C, C++, ObjC)
+"""""""""""""""""""""""""""""""""""""
+This checker warns when the denominator in a division
+operation is a tainted (potentially attacker-controlled) value.
+If the attacker can set the denominator to 0, a runtime error can
+be triggered. The checker warns when the denominator is a tainted
+value and the analyzer cannot prove that it is not 0. This warning
+is more pessimistic than the :ref:`core-DivideZero` checker
+which warns only when it can prove that the denominator is 0.
+
+.. code-block:: c
+
+ int vulnerable(int n) {
+ size_t size = 0;
+ scanf("%zu", &size);
+ return n / size; // warn: Division by a tainted value, possibly zero
+ }
+
+ int not_vulnerable(int n) {
+ size_t size = 0;
+ scanf("%zu", &size);
+ if (!size)
+ return 0;
+ return n / size; // no warning
+ }
+
 .. _security-checkers: security @@ -1309,6 +1572,49 @@ Warn on ``mmap()`` calls with both writable and executable access. // code }
+.. _security-PointerSub:
+
+security.PointerSub (C)
+"""""""""""""""""""""""
+Check for pointer subtractions on two pointers pointing to different memory
+chunks. According to the C standard §6.5.6 only subtraction of pointers that
+point into (or one past the end of) the same array object is valid (for this
+purpose non-array variables are like arrays of size 1). This checker only
+searches for different memory objects at subtraction, but does not check if the
+array index is correct. Furthermore, only cases are reported where
+stack-allocated objects are involved (no warnings on pointers to memory
+allocated by `malloc`).
+
+.. code-block:: c
+
+ void test() {
+ int a, b, c[10], d[10];
+ int x = &c[3] - &c[1];
+ x = &d[4] - &c[1]; // warn: 'c' and 'd' are different arrays
+ x = (&a + 1) - &a;
+ x = &b - &a; // warn: 'a' and 'b' are different variables
+ }
+
+ struct S {
+ int x[10];
+ int y[10];
+ };
+
+ void test1() {
+ struct S a[10];
+ struct S b;
+ int d = &a[4] - &a[6];
+ d = &a[0].x[3] - &a[0].x[1];
+ d = a[0].y - a[0].x; // warn: 'S.b' and 'S.a' are different objects
+ d = (char *)&b.y - (char *)&b.x; // warn: different members of the same object
+ d = (char *)&b.y - (char *)&b; // warn: object of type S is not the same array as a member of it
+ }
+
+There may be existing applications that use code like the above for calculating
+offsets of members in a structure, using pointer subtractions. This is still
+undefined behavior according to the standard and code like this can be replaced
+with the `offsetof` macro.
+
 .. _security-putenv-stack-array: security.PutenvStackArray (C) @@ -2512,62 +2818,19 @@ Warn about unintended use of identical expressions in operators. } }
-.. _alpha-core-PointerArithm:
-
-alpha.core.PointerArithm (C)
-""""""""""""""""""""""""""""
-Check for pointer arithmetic on locations other than array elements.
-
-.. code-block:: c
-
- void test() {
- int x;
- int *p;
- p = &x + 1; // warn
- }
-
-.. _alpha-core-PointerSub:
-
-alpha.core.PointerSub (C)
-"""""""""""""""""""""""""
-Check for pointer subtractions on two pointers pointing to different memory
-chunks. According to the C standard §6.5.6 only subtraction of pointers that
-point into (or one past the end) the same array object is valid (for this
-purpose non-array variables are like arrays of size 1). This checker only
-searches for different memory objects at subtraction, but does not check if the
-array index is correct.
Furthermore, only cases are reported where -stack-allocated objects are involved (no warnings on pointers to memory -allocated by `malloc`). - -.. code-block:: c - - void test() { - int a, b, c[10], d[10]; - int x = &c[3] - &c[1]; - x = &d[4] - &c[1]; // warn: 'c' and 'd' are different arrays - x = (&a + 1) - &a; - x = &b - &a; // warn: 'a' and 'b' are different variables - } - - struct S { - int x[10]; - int y[10]; - }; - - void test1() { - struct S a[10]; - struct S b; - int d = &a[4] - &a[6]; - d = &a[0].x[3] - &a[0].x[1]; - d = a[0].y - a[0].x; // warn: 'S.b' and 'S.a' are different objects - d = (char *)&b.y - (char *)&b.x; // warn: different members of the same object - d = (char *)&b.y - (char *)&b; // warn: object of type S is not the same array as a member of it - } - -There may be existing applications that use code like above for calculating -offsets of members in a structure, using pointer subtractions. This is still -undefined behavior according to the standard and code like this can be replaced -with the `offsetof` macro. +.. _alpha-core-PointerArithm: + +alpha.core.PointerArithm (C) +"""""""""""""""""""""""""""" +Check for pointer arithmetic on locations other than array elements. + +.. code-block:: c + + void test() { + int x; + int *p; + p = &x + 1; // warn + } .. _alpha-core-StackAddressAsyncEscape: @@ -2976,7 +3239,7 @@ Warn about buffer overflows (newer checker). buf[0][-1] = 1; // warn } - // note: requires alpha.security.taint check turned on. + // note: requires optin.taint check turned on. void test() { char s[] = "abc"; int x = getchar(); @@ -3009,239 +3272,6 @@ alpha.security.cert SEI CERT checkers which tries to find errors based on their `C coding rules `_. -alpha.security.taint -^^^^^^^^^^^^^^^^^^^^ - -Checkers implementing -`taint analysis `_. - -.. _alpha-security-taint-GenericTaint: - -alpha.security.taint.GenericTaint (C, C++) -"""""""""""""""""""""""""""""""""""""""""" - -Taint analysis identifies potential security vulnerabilities where the -attacker can inject malicious data to the program to execute an attack -(privilege escalation, command injection, SQL injection etc.). - -The malicious data is injected at the taint source (e.g. ``getenv()`` call) -which is then propagated through function calls and being used as arguments of -sensitive operations, also called as taint sinks (e.g. ``system()`` call). - -One can defend against this type of vulnerability by always checking and -sanitizing the potentially malicious, untrusted user input. - -The goal of the checker is to discover and show to the user these potential -taint source-sink pairs and the propagation call chain. - -The most notable examples of taint sources are: - - - data from network - - files or standard input - - environment variables - - data from databases - -Let us examine a practical example of a Command Injection attack. - -.. code-block:: c - - // Command Injection Vulnerability Example - int main(int argc, char** argv) { - char cmd[2048] = "/bin/cat "; - char filename[1024]; - printf("Filename:"); - scanf (" %1023[^\n]", filename); // The attacker can inject a shell escape here - strcat(cmd, filename); - system(cmd); // Warning: Untrusted data is passed to a system call - } - -The program prints the content of any user specified file. -Unfortunately the attacker can execute arbitrary commands -with shell escapes. For example with the following input the `ls` command is also -executed after the contents of `/etc/shadow` is printed. 
-`Input: /etc/shadow ; ls /` - -The analysis implemented in this checker points out this problem. - -One can protect against such attack by for example checking if the provided -input refers to a valid file and removing any invalid user input. - -.. code-block:: c - - // No vulnerability anymore, but we still get the warning - void sanitizeFileName(char* filename){ - if (access(filename,F_OK)){// Verifying user input - printf("File does not exist\n"); - filename[0]='\0'; - } - } - int main(int argc, char** argv) { - char cmd[2048] = "/bin/cat "; - char filename[1024]; - printf("Filename:"); - scanf (" %1023[^\n]", filename); // The attacker can inject a shell escape here - sanitizeFileName(filename);// filename is safe after this point - if (!filename[0]) - return -1; - strcat(cmd, filename); - system(cmd); // Superfluous Warning: Untrusted data is passed to a system call - } - -Unfortunately, the checker cannot discover automatically that the programmer -have performed data sanitation, so it still emits the warning. - -One can get rid of this superfluous warning by telling by specifying the -sanitation functions in the taint configuration file (see -:doc:`user-docs/TaintAnalysisConfiguration`). - -.. code-block:: YAML - - Filters: - - Name: sanitizeFileName - Args: [0] - -The clang invocation to pass the configuration file location: - -.. code-block:: bash - - clang --analyze -Xclang -analyzer-config -Xclang alpha.security.taint.TaintPropagation:Config=`pwd`/taint_config.yml ... - -If you are validating your inputs instead of sanitizing them, or don't want to -mention each sanitizing function in our configuration, -you can use a more generic approach. - -Introduce a generic no-op `csa_mark_sanitized(..)` function to -tell the Clang Static Analyzer -that the variable is safe to be used on that analysis path. - -.. code-block:: c - - // Marking sanitized variables safe. - // No vulnerability anymore, no warning. - - // User csa_mark_sanitize function is for the analyzer only - #ifdef __clang_analyzer__ - void csa_mark_sanitized(const void *); - #endif - - int main(int argc, char** argv) { - char cmd[2048] = "/bin/cat "; - char filename[1024]; - printf("Filename:"); - scanf (" %1023[^\n]", filename); - if (access(filename,F_OK)){// Verifying user input - printf("File does not exist\n"); - return -1; - } - #ifdef __clang_analyzer__ - csa_mark_sanitized(filename); // Indicating to CSA that filename variable is safe to be used after this point - #endif - strcat(cmd, filename); - system(cmd); // No warning - } - -Similarly to the previous example, you need to -define a `Filter` function in a `YAML` configuration file -and add the `csa_mark_sanitized` function. - -.. code-block:: YAML - - Filters: - - Name: csa_mark_sanitized - Args: [0] - -Then calling `csa_mark_sanitized(X)` will tell the analyzer that `X` is safe to -be used after this point, because its contents are verified. It is the -responsibility of the programmer to ensure that this verification was indeed -correct. Please note that `csa_mark_sanitized` function is only declared and -used during Clang Static Analysis and skipped in (production) builds. - -Further examples of injection vulnerabilities this checker can find. - -.. code-block:: c - - void test() { - char x = getchar(); // 'x' marked as tainted - system(&x); // warn: untrusted data is passed to a system call - } - - // note: compiler internally checks if the second param to - // sprintf is a string literal or not. - // Use -Wno-format-security to suppress compiler warning. 
- void test() { - char s[10], buf[10]; - fscanf(stdin, "%s", s); // 's' marked as tainted - - sprintf(buf, s); // warn: untrusted data used as a format string - } - -There are built-in sources, propagations and sinks even if no external taint -configuration is provided. - -Default sources: - ``_IO_getc``, ``fdopen``, ``fopen``, ``freopen``, ``get_current_dir_name``, - ``getch``, ``getchar``, ``getchar_unlocked``, ``getwd``, ``getcwd``, - ``getgroups``, ``gethostname``, ``getlogin``, ``getlogin_r``, ``getnameinfo``, - ``gets``, ``gets_s``, ``getseuserbyname``, ``readlink``, ``readlinkat``, - ``scanf``, ``scanf_s``, ``socket``, ``wgetch`` - -Default propagations rules: - ``atoi``, ``atol``, ``atoll``, ``basename``, ``dirname``, ``fgetc``, - ``fgetln``, ``fgets``, ``fnmatch``, ``fread``, ``fscanf``, ``fscanf_s``, - ``index``, ``inflate``, ``isalnum``, ``isalpha``, ``isascii``, ``isblank``, - ``iscntrl``, ``isdigit``, ``isgraph``, ``islower``, ``isprint``, ``ispunct``, - ``isspace``, ``isupper``, ``isxdigit``, ``memchr``, ``memrchr``, ``sscanf``, - ``getc``, ``getc_unlocked``, ``getdelim``, ``getline``, ``getw``, ``memcmp``, - ``memcpy``, ``memmem``, ``memmove``, ``mbtowc``, ``pread``, ``qsort``, - ``qsort_r``, ``rawmemchr``, ``read``, ``recv``, ``recvfrom``, ``rindex``, - ``strcasestr``, ``strchr``, ``strchrnul``, ``strcasecmp``, ``strcmp``, - ``strcspn``, ``strncasecmp``, ``strncmp``, ``strndup``, - ``strndupa``, ``strpbrk``, ``strrchr``, ``strsep``, ``strspn``, - ``strstr``, ``strtol``, ``strtoll``, ``strtoul``, ``strtoull``, ``tolower``, - ``toupper``, ``ttyname``, ``ttyname_r``, ``wctomb``, ``wcwidth`` - -Default sinks: - ``printf``, ``setproctitle``, ``system``, ``popen``, ``execl``, ``execle``, - ``execlp``, ``execv``, ``execvp``, ``execvP``, ``execve``, ``dlopen`` - -Please note that there are no built-in filter functions. - -One can configure their own taint sources, sinks, and propagation rules by -providing a configuration file via checker option -``alpha.security.taint.TaintPropagation:Config``. The configuration file is in -`YAML `_ format. The -taint-related options defined in the config file extend but do not override the -built-in sources, rules, sinks. The format of the external taint configuration -file is not stable, and could change without any notice even in a non-backward -compatible way. - -For a more detailed description of configuration options, please see the -:doc:`user-docs/TaintAnalysisConfiguration`. For an example see -:ref:`clangsa-taint-configuration-example`. - -**Configuration** - -* `Config` Specifies the name of the YAML configuration file. The user can - define their own taint sources and sinks. - -**Related Guidelines** - -* `CWE Data Neutralization Issues - `_ -* `SEI Cert STR02-C. Sanitize data passed to complex subsystems - `_ -* `SEI Cert ENV33-C. Do not call system() - `_ -* `ENV03-C. Sanitize the environment when invoking external programs - `_ - -**Limitations** - -* The taintedness property is not propagated through function calls which are - unknown (or too complex) to the analyzer, unless there is a specific - propagation rule built-in to the checker or given in the YAML configuration - file. This causes potential true positive findings to be lost. - alpha.unix ^^^^^^^^^^ @@ -3440,6 +3470,27 @@ Check for non-determinism caused by sorting of pointers. alpha.WebKit ^^^^^^^^^^^^ +.. 
_alpha-webkit-NoUncheckedPtrMemberChecker:
+
+alpha.webkit.NoUncheckedPtrMemberChecker
+""""""""""""""""""""""""""""""""""""""""
+Raw pointers and references to an object which supports CheckedPtr or CheckedRef can't be used as class members. Only CheckedPtr, CheckedRef, RefPtr, or Ref are allowed.
+
+.. code-block:: cpp
+
+ struct CheckableObj {
+ void incrementPtrCount() {}
+ void decrementPtrCount() {}
+ };
+
+ struct Foo {
+ CheckableObj* ptr; // warn
+ CheckableObj& ref; // warn
+ // ...
+ };
+
+See `WebKit Guidelines for Safer C++ Programming `_ for details.
+
 .. _alpha-webkit-UncountedCallArgsChecker: alpha.webkit.UncountedCallArgsChecker
diff --git a/clang/docs/analyzer/user-docs/TaintAnalysisConfiguration.rst b/clang/docs/analyzer/user-docs/TaintAnalysisConfiguration.rst index 67e71d558f2ce..4849a553cb2ce 100644 --- a/clang/docs/analyzer/user-docs/TaintAnalysisConfiguration.rst +++ b/clang/docs/analyzer/user-docs/TaintAnalysisConfiguration.rst @@ -4,10 +4,10 @@ Taint Analysis Configuration The Clang Static Analyzer uses taint analysis to detect injection vulnerability related issues in code. The backbone of taint analysis in the Clang SA is the ``TaintPropagation`` modeling checker.
-The reports are emitted via the :ref:`alpha-security-taint-GenericTaint` checker.
+The reports are emitted via the :ref:`optin-taint-GenericTaint` checker.
 The ``TaintPropagation`` checker has a default taint-related configuration. The built-in default settings are defined in code, and they are always in effect.
-The checker also provides a configuration interface for extending the default settings via the ``alpha.security.taint.TaintPropagation:Config`` checker config parameter
+The checker also provides a configuration interface for extending the default settings via the ``optin.taint.TaintPropagation:Config`` checker config parameter
 by providing a configuration file to the checker in `YAML `_ format. This documentation describes the syntax of the configuration file and gives the informal semantics of the configuration options.
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 1984310df0442..a4d36f2eacd5d 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -253,7 +253,7 @@ class ASTContext : public RefCountedBase { mutable llvm::FoldingSet BitIntTypes; mutable llvm::ContextualFoldingSet DependentBitIntTypes;
- llvm::FoldingSet BTFTagAttributedTypes;
+ mutable llvm::FoldingSet BTFTagAttributedTypes;
 llvm::FoldingSet HLSLAttributedResourceTypes; mutable llvm::FoldingSet CountAttributedTypes; @@ -324,6 +324,14 @@ class ASTContext : public RefCountedBase { /// This is lazily created. This is intentionally not serialized. mutable llvm::StringMap StringLiteralCache;
+ /// The next string literal "version" to allocate during constant evaluation.
+ /// This is used to distinguish between repeated evaluations of the same
+ /// string literal.
+ ///
+ /// We don't need to serialize this because constants get re-evaluated in the
+ /// current file before they are compared locally.
+ unsigned NextStringLiteralVersion = 0;
+
 /// MD5 hash of CUID. It is calculated when first used and cached by this /// data member.
mutable std::string CUIDHash; @@ -1197,7 +1205,8 @@ class ASTContext : public RefCountedBase { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) CanQualType SingletonId; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) CanQualType SingletonId; +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ + CanQualType SingletonId; #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) CanQualType SingletonId; #include "clang/Basic/HLSLIntangibleTypes.def" @@ -1369,10 +1378,21 @@ class ASTContext : public RefCountedBase { /// calling T.withConst(). QualType getConstType(QualType T) const { return T.withConst(); } + /// Rebuild a type, preserving any existing type sugar. For function types, + /// you probably want to just use \c adjustFunctionResultType and friends + /// instead. + QualType adjustType(QualType OldType, + llvm::function_ref Adjust) const; + /// Change the ExtInfo on a function type. const FunctionType *adjustFunctionType(const FunctionType *Fn, FunctionType::ExtInfo EInfo); + /// Change the result type of a function type, preserving sugar such as + /// attributed types. + QualType adjustFunctionResultType(QualType FunctionType, + QualType NewResultType); + /// Adjust the given function result type. CanQualType getCanonicalFunctionResultType(QualType ResultType) const; @@ -1702,7 +1722,7 @@ class ASTContext : public RefCountedBase { QualType equivalentType) const; QualType getBTFTagAttributedType(const BTFTypeTagAttr *BTFAttr, - QualType Wrapped); + QualType Wrapped) const; QualType getHLSLAttributedResourceType( QualType Wrapped, QualType Contained, @@ -3289,6 +3309,10 @@ class ASTContext : public RefCountedBase { /// PredefinedExpr to cache evaluated results. StringLiteral *getPredefinedStringLiteralFromCache(StringRef Key) const; + /// Return the next version number to be used for a string literal evaluated + /// as part of constant evaluation. + unsigned getNextStringLiteralVersion() { return NextStringLiteralVersion++; } + /// Return a declaration for the global GUID object representing the given /// GUID value. MSGuidDecl *getMSGuidDecl(MSGuidDeclParts Parts) const; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 0600ecc4d14a1..7ff35d73df599 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -133,6 +133,10 @@ class TranslationUnitDecl : public Decl, static TranslationUnitDecl *castFromDeclContext(const DeclContext *DC) { return static_cast(const_cast(DC)); } + + /// Retrieves the canonical declaration of this translation unit. + TranslationUnitDecl *getCanonicalDecl() override { return getFirstDecl(); } + const TranslationUnitDecl *getCanonicalDecl() const { return getFirstDecl(); } }; /// Represents a `#pragma comment` line. 
Always a child of
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index 252e6e9256414..2693cc0e95b4b 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -1965,9 +1965,11 @@ class CXXDeductionGuideDecl : public FunctionDecl {
                         ExplicitSpecifier ES,
                         const DeclarationNameInfo &NameInfo, QualType T,
                         TypeSourceInfo *TInfo, SourceLocation EndLocation,
-                        CXXConstructorDecl *Ctor, DeductionCandidate Kind)
+                        CXXConstructorDecl *Ctor, DeductionCandidate Kind,
+                        Expr *TrailingRequiresClause)
       : FunctionDecl(CXXDeductionGuide, C, DC, StartLoc, NameInfo, T, TInfo,
-                     SC_None, false, false, ConstexprSpecKind::Unspecified),
+                     SC_None, false, false, ConstexprSpecKind::Unspecified,
+                     TrailingRequiresClause),
         Ctor(Ctor), ExplicitSpec(ES) {
     if (EndLocation.isValid())
       setRangeEnd(EndLocation);
@@ -1987,7 +1989,8 @@ class CXXDeductionGuideDecl : public FunctionDecl {
          ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T,
          TypeSourceInfo *TInfo, SourceLocation EndLocation,
          CXXConstructorDecl *Ctor = nullptr,
-         DeductionCandidate Kind = DeductionCandidate::Normal);
+         DeductionCandidate Kind = DeductionCandidate::Normal,
+         Expr *TrailingRequiresClause = nullptr);

   static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C,
                                                    GlobalDeclID ID);
diff --git a/clang/include/clang/AST/DeclID.h b/clang/include/clang/AST/DeclID.h
index f4607e42c4be3..49964b43c7d1d 100644
--- a/clang/include/clang/AST/DeclID.h
+++ b/clang/include/clang/AST/DeclID.h
@@ -189,6 +189,7 @@ class LocalDeclID : public DeclIDBase {
   // Every Decl ID is a local decl ID to the module being written in ASTWriter.
   friend class ASTWriter;
   friend class GlobalDeclID;
+  friend struct llvm::DenseMapInfo<LocalDeclID>;

 public:
   LocalDeclID() : Base() {}
@@ -267,6 +268,27 @@ template <> struct DenseMapInfo<clang::GlobalDeclID> {
   }
 };

+template <> struct DenseMapInfo<clang::LocalDeclID> {
+  using LocalDeclID = clang::LocalDeclID;
+  using DeclID = LocalDeclID::DeclID;
+
+  static LocalDeclID getEmptyKey() {
+    return LocalDeclID(DenseMapInfo<DeclID>::getEmptyKey());
+  }
+
+  static LocalDeclID getTombstoneKey() {
+    return LocalDeclID(DenseMapInfo<DeclID>::getTombstoneKey());
+  }
+
+  static unsigned getHashValue(const LocalDeclID &Key) {
+    return DenseMapInfo<DeclID>::getHashValue(Key.getRawValue());
+  }
+
+  static bool isEqual(const LocalDeclID &L, const LocalDeclID &R) {
+    return L == R;
+  }
+};
+
 } // namespace llvm

 #endif
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index ea1ffbc7fd08b..90f5b7fc9ab6f 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -547,6 +547,32 @@ class OpenACCAsyncClause : public OpenACCClauseWithSingleIntExpr {
                                     SourceLocation EndLoc);
 };

+/// Represents a 'collapse' clause on a 'loop' construct. This clause takes an
+/// integer constant expression 'N' that represents how deep to collapse the
+/// construct. It also takes an optional 'force' tag that permits intervening
+/// code in the loops.
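// Editor's note: a hedged sketch (not part of the patch) of the source-level
// construct the class below models. The optional 'force' tag is what permits
// the intervening call between the two collapsed loops; N, M, helper, and
// work are placeholder names for illustration:
//
//   #pragma acc loop collapse(force: 2)
//   for (int i = 0; i < N; ++i) {
//     helper(i);                     // intervening code, allowed by 'force'
//     for (int j = 0; j < M; ++j)
//       work(i, j);
//   }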
+class OpenACCCollapseClause : public OpenACCClauseWithSingleIntExpr { + bool HasForce = false; + + OpenACCCollapseClause(SourceLocation BeginLoc, SourceLocation LParenLoc, + bool HasForce, Expr *LoopCount, SourceLocation EndLoc); + +public: + const Expr *getLoopCount() const { return getIntExpr(); } + Expr *getLoopCount() { return getIntExpr(); } + + bool hasForce() const { return HasForce; } + + static bool classof(const OpenACCClause *C) { + return C->getClauseKind() == OpenACCClauseKind::Collapse; + } + + static OpenACCCollapseClause *Create(const ASTContext &C, + SourceLocation BeginLoc, + SourceLocation LParenLoc, bool HasForce, + Expr *LoopCount, SourceLocation EndLoc); +}; + /// Represents a clause with one or more 'var' objects, represented as an expr, /// as its arguments. Var-list is expected to be stored in trailing storage. /// For now, we're just storing the original expression in its entirety, unlike diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index dc87b84153e74..7126940058bae 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3050,7 +3050,7 @@ class BuiltinType : public Type { #define WASM_TYPE(Name, Id, SingletonId) Id, #include "clang/Basic/WebAssemblyReferenceTypes.def" // AMDGPU types -#define AMDGPU_TYPE(Name, Id, SingletonId) Id, +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) Id, #include "clang/Basic/AMDGPUTypes.def" // HLSL intangible Types #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) Id, @@ -6191,7 +6191,9 @@ class HLSLAttributedResourceType : public Type, public llvm::FoldingSetNode { HLSLAttributedResourceType(QualType Canon, QualType Wrapped, QualType Contained, const Attributes &Attrs) - : Type(HLSLAttributedResource, Canon, Wrapped->getDependence()), + : Type(HLSLAttributedResource, Canon, + Contained.isNull() ? 
TypeDependence::None + : Contained->getDependence()), WrappedType(Wrapped), ContainedType(Contained), Attrs(Attrs) {} public: diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index bb7bfa8cd0b76..d05072607e949 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -893,7 +893,7 @@ let Class = BuiltinType in { case BuiltinType::ID: return ctx.SINGLETON_ID; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(NAME, ID, SINGLETON_ID) \ +#define AMDGPU_TYPE(NAME, ID, SINGLETON_ID, WIDTH, ALIGN) \ case BuiltinType::ID: return ctx.SINGLETON_ID; #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index 7454d61f5dd51..e47e544fdc82c 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -11,11 +11,11 @@ //===----------------------------------------------------------------------===// #ifndef AMDGPU_OPAQUE_PTR_TYPE -#define AMDGPU_OPAQUE_PTR_TYPE(Name, AS, Width, Align, Id, SingletonId) \ - AMDGPU_TYPE(Name, Id, SingletonId) +#define AMDGPU_OPAQUE_PTR_TYPE(Name, Id, SingletonId, Width, Align, AS) \ + AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) #endif -AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", 8, 128, 128, AMDGPUBufferRsrc, AMDGPUBufferRsrcTy) +AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) #undef AMDGPU_TYPE #undef AMDGPU_OPAQUE_PTR_TYPE diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index ce86116680d7a..fbcbf0ed41641 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3366,16 +3366,18 @@ def DiagnoseIf : InheritableAttr { let Spellings = [GNU<"diagnose_if">]; let Subjects = SubjectList<[Function, ObjCMethod, ObjCProperty]>; let Args = [ExprArgument<"Cond">, StringArgument<"Message">, - EnumArgument<"DefaultSeverity", - "DefaultSeverity", + EnumArgument<"DiagnosticType", "DiagnosticType", /*is_string=*/true, - ["error", "warning"], - ["DS_error", "DS_warning"]>, - StringArgument<"WarningGroup", /*optional*/ 1>, + ["error", "warning"], + ["DT_Error", "DT_Warning"]>, BoolArgument<"ArgDependent", 0, /*fake*/ 1>, DeclArgument]; let InheritEvenIfAlreadyPresent = 1; let LateParsed = LateAttrParseStandard; + let AdditionalMembers = [{ + bool isError() const { return diagnosticType == DT_Error; } + bool isWarning() const { return diagnosticType == DT_Warning; } + }]; let TemplateDependent = 1; let Documentation = [DiagnoseIfDocs]; } diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index f23a148e546fa..53d88482698f0 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6696,6 +6696,20 @@ When the Owner's lifetime ends, it will consider the Pointer to be dangling. P.getInt(); // P is dangling } +If a template class is annotated with ``[[gsl::Owner]]``, and the first +instantiated template argument is a pointer type (raw pointer, or ``[[gsl::Pointer]]``), +the analysis will consider the instantiated class as a container of the pointer. +When constructing such an object from a GSL owner object, the analysis will +assume that the container holds a pointer to the owner object. Consequently, +when the owner object is destroyed, the pointer will be considered dangling. + +.. 
code-block:: c++
+
+   int f() {
+     std::vector<std::string_view> v = {std::string()}; // v holds a dangling pointer.
+     std::optional<std::string_view> o = std::string(); // o holds a dangling pointer.
+   }
+
 }];
}
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 8c5d7ad763bf9..8090119e512fb 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1328,6 +1328,12 @@ def ElementwisePopcount : Builtin {
   let Prototype = "void(...)";
 }

+def ElementwiseFmod : Builtin {
+  let Spellings = ["__builtin_elementwise_fmod"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ElementwisePow : Builtin {
   let Spellings = ["__builtin_elementwise_pow"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
@@ -4703,12 +4709,6 @@ def HLSLClamp : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }

-def HLSLCreateHandle : LangBuiltin<"HLSL_LANG"> {
-  let Spellings = ["__builtin_hlsl_create_handle"];
-  let Attributes = [NoThrow, Const];
-  let Prototype = "void*(unsigned char)";
-}
-
 def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_dot"];
   let Attributes = [NoThrow, Const];
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 6fff562165080..6b7bce5bc00d4 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -139,6 +139,7 @@ TARGET_BUILTIN(__nvvm_is_explicit_cluster, "b", "nc", AND(SM_90, PTX78))
 BUILTIN(__nvvm_read_ptx_sreg_laneid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_warpid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_nwarpid, "i", "nc")
+BUILTIN(__nvvm_read_ptx_sreg_warpsize, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_smid, "i", "nc")
 BUILTIN(__nvvm_read_ptx_sreg_nsmid, "i", "nc")
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td
index 4cc89a8a9d8af..3263603a8a1cf 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.td
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -146,3 +146,8 @@ let Features = "zihintntl", Attributes = [CustomTypeChecking] in {
 def ntl_load : RISCVBuiltin<"void(...)">;
 def ntl_store : RISCVBuiltin<"void(...)">;
 } // Features = "zihintntl", Attributes = [CustomTypeChecking]
+
+//===----------------------------------------------------------------------===//
+// XCV extensions.
+//===----------------------------------------------------------------------===//
+include "clang/Basic/BuiltinsRISCVXCV.td"
diff --git a/clang/include/clang/Basic/BuiltinsRISCVXCV.td b/clang/include/clang/Basic/BuiltinsRISCVXCV.td
new file mode 100644
index 0000000000000..06ce07ade5c12
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsRISCVXCV.td
@@ -0,0 +1,41 @@
+//==- BuiltinsRISCVXCV.td - RISC-V CORE-V Builtin database ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the CORE-V-specific builtin function database. Users of
+// this file must define the BUILTIN macro to make use of this information.
+// +//===----------------------------------------------------------------------===// + +class RISCXCVBuiltin : TargetBuiltin { + let Spellings = ["__builtin_riscv_cv_" # NAME]; + let Prototype = prototype; + let Features = features; +} + +let Attributes = [NoThrow, Const] in { +//===----------------------------------------------------------------------===// +// XCValu extension. +//===----------------------------------------------------------------------===// +def alu_slet : RISCXCVBuiltin<"int(int, int)", "xcvalu">; +def alu_sletu : RISCXCVBuiltin<"int(unsigned int, unsigned int)", "xcvalu">; +def alu_exths : RISCXCVBuiltin<"int(int)", "xcvalu">; +def alu_exthz : RISCXCVBuiltin<"unsigned int(unsigned int)", "xcvalu">; +def alu_extbs : RISCXCVBuiltin<"int(int)", "xcvalu">; +def alu_extbz : RISCXCVBuiltin<"unsigned int(unsigned int)", "xcvalu">; + +def alu_clip : RISCXCVBuiltin<"int(int, int)", "xcvalu">; +def alu_clipu : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int)", "xcvalu">; +def alu_addN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_adduN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_addRN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_adduRN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_subN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_subuN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_subRN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_subuRN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +} // Attributes = [NoThrow, Const] diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index 25c656a530b15..2a987abcf9a35 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -551,21 +551,21 @@ TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "n", "rdseed") TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "n", "rdseed") // LZCNT -TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "nc", "lzcnt") -TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "nc", "lzcnt") +TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "ncE", "lzcnt") +TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "ncE", "lzcnt") // BMI -TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "nc", "bmi") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "nc", "") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "nc", "") +TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "ncE", "bmi") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "ncE", "") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "ncE", "") // BMI2 -TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "nc", "bmi2") +TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "ncE", "bmi2") +TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "ncE", "bmi2") +TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "ncE", "bmi2") // TBM -TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "nc", "tbm") +TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "ncE", "tbm") // LWP TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "n", "lwp") diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index db381aa77e761..d5fdb272d92d1 100644 --- 
a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -70,13 +70,13 @@ TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcUOiUOiUOi*", "n", "") TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcUOiUOiUOi*", "n", "") TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiUOi*", "n", "rdrnd") TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiUOi*", "n", "rdseed") -TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "nc", "lzcnt") -TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "nc", "bmi") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "nc", "") -TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_pext_di, "UOiUOiUOi", "nc", "bmi2") -TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "nc", "tbm") +TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "ncE", "lzcnt") +TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "ncE", "bmi") +TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "ncE", "") +TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "ncE", "bmi2") +TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "ncE", "bmi2") +TARGET_BUILTIN(__builtin_ia32_pext_di, "UOiUOiUOi", "ncE", "bmi2") +TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "ncE", "tbm") TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcUOiUiIUi", "n", "lwp") TARGET_BUILTIN(__builtin_ia32_lwpval64, "vUOiUiIUi", "n", "lwp") TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "OiV2dIi", "ncV:128:", "avx512f") diff --git a/clang/include/clang/Basic/CFProtectionOptions.def b/clang/include/clang/Basic/CFProtectionOptions.def new file mode 100644 index 0000000000000..b9df2de7f7eba --- /dev/null +++ b/clang/include/clang/Basic/CFProtectionOptions.def @@ -0,0 +1,15 @@ +//===-- CFProtectionOptions.def - cf-protection options ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +#ifdef CF_BRANCH_LABEL_SCHEME +CF_BRANCH_LABEL_SCHEME(Unlabeled, unlabeled) +CF_BRANCH_LABEL_SCHEME(FuncSig, func-sig) + +#undef CF_BRANCH_LABEL_SCHEME +#endif // #ifdef CF_BRANCH_LABEL_SCHEME diff --git a/clang/include/clang/Basic/CFProtectionOptions.h b/clang/include/clang/Basic/CFProtectionOptions.h new file mode 100644 index 0000000000000..13f46d4c13e7e --- /dev/null +++ b/clang/include/clang/Basic/CFProtectionOptions.h @@ -0,0 +1,38 @@ +//===--- CFProtectionOptions.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines constants for -fcf-protection and other related flags. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_BASIC_CFPROTECTIONOPTIONS_H +#define LLVM_CLANG_BASIC_CFPROTECTIONOPTIONS_H + +#include "llvm/Support/ErrorHandling.h" + +namespace clang { + +enum class CFBranchLabelSchemeKind { + Default, +#define CF_BRANCH_LABEL_SCHEME(Kind, FlagVal) Kind, +#include "clang/Basic/CFProtectionOptions.def" +}; + +static inline const char * +getCFBranchLabelSchemeFlagVal(const CFBranchLabelSchemeKind Scheme) { +#define CF_BRANCH_LABEL_SCHEME(Kind, FlagVal) \ + if (Scheme == CFBranchLabelSchemeKind::Kind) \ + return #FlagVal; +#include "clang/Basic/CFProtectionOptions.def" + + llvm::report_fatal_error("invalid scheme"); +} + +} // namespace clang + +#endif // #ifndef LLVM_CLANG_BASIC_CFPROTECTIONOPTIONS_H diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 2893377e5a38b..eac831278ee20 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -111,6 +111,8 @@ CODEGENOPT(CFProtectionReturn , 1, 0) ///< if -fcf-protection is ///< set to full or return. CODEGENOPT(CFProtectionBranch , 1, 0) ///< if -fcf-protection is ///< set to full or branch. +ENUM_CODEGENOPT(CFBranchLabelScheme, CFBranchLabelSchemeKind, 2, + CFBranchLabelSchemeKind::Default) ///< if -mcf-branch-label-scheme is set. CODEGENOPT(FunctionReturnThunks, 1, 0) ///< -mfunction-return={keep|thunk-extern} CODEGENOPT(IndirectBranchCSPrefix, 1, 0) ///< if -mindirect-branch-cs-prefix ///< is set. diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index f2a707a8ba8d7..2dcf98b465661 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_BASIC_CODEGENOPTIONS_H #define LLVM_CLANG_BASIC_CODEGENOPTIONS_H +#include "clang/Basic/CFProtectionOptions.h" #include "clang/Basic/PointerAuthOptions.h" #include "clang/Basic/Sanitizers.h" #include "clang/Basic/XRayInstr.h" @@ -107,18 +108,13 @@ class CodeGenOptions : public CodeGenOptionsBase { // This field stores one of the allowed values for the option // -fbasic-block-sections=. The allowed values with this option are: - // {"labels", "all", "list=", "none"}. + // {"all", "list=", "none"}. // - // "labels": Only generate basic block symbols (labels) for all basic - // blocks, do not generate unique sections for basic blocks. - // Use the machine basic block id in the symbol name to - // associate profile info from virtual address to machine - // basic block. // "all" : Generate basic block sections for all basic blocks. // "list=": Generate basic block sections for a subset of basic blocks. // The functions and the machine basic block ids are specified // in the file. - // "none": Disable sections/labels for basic blocks. + // "none": Disable sections for basic blocks. std::string BBSections; // If set, override the default value of MCAsmInfo::BinutilsVersion. If diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index e17ed8f98afa9..3b1efdb12824c 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -371,12 +371,10 @@ class DiagnosticsEngine : public RefCountedBase { // Map extensions to warnings or errors? 
diag::Severity ExtBehavior = diag::Severity::Ignored; - DiagnosticIDs &DiagIDs; - - DiagState(DiagnosticIDs &DiagIDs) + DiagState() : IgnoreAllWarnings(false), EnableAllWarnings(false), WarningsAsErrors(false), ErrorsAsFatal(false), - SuppressSystemWarnings(false), DiagIDs(DiagIDs) {} + SuppressSystemWarnings(false) {} using iterator = llvm::DenseMap::iterator; using const_iterator = @@ -886,8 +884,6 @@ class DiagnosticsEngine : public RefCountedBase { /// \param FormatString A fixed diagnostic format string that will be hashed /// and mapped to a unique DiagID. template - // TODO: Deprecate this once all uses are removed from LLVM - // [[deprecated("Use a CustomDiagDesc instead of a Level")]] unsigned getCustomDiagID(Level L, const char (&FormatString)[N]) { return Diags->getCustomDiagID((DiagnosticIDs::Level)L, StringRef(FormatString, N - 1)); diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 21a307d1e8987..6a658cf14356f 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -96,6 +96,9 @@ def note_constexpr_pointer_constant_comparison : Note< "at runtime">; def note_constexpr_literal_comparison : Note< "comparison of addresses of literals has unspecified value">; +def note_constexpr_opaque_call_comparison : Note< + "comparison against opaque constant address '%0' can only be performed at " + "runtime">; def note_constexpr_pointer_weak_comparison : Note< "comparison against address of weak declaration '%0' can only be performed " "at runtime">; diff --git a/clang/include/clang/Basic/DiagnosticCategories.h b/clang/include/clang/Basic/DiagnosticCategories.h index 839f8dee3ca89..14be326f7515f 100644 --- a/clang/include/clang/Basic/DiagnosticCategories.h +++ b/clang/include/clang/Basic/DiagnosticCategories.h @@ -21,12 +21,11 @@ namespace clang { }; enum class Group { -#define DIAG_ENTRY(GroupName, FlagNameOffset, Members, SubGroups, Docs) \ - GroupName, +#define DIAG_ENTRY(GroupName, FlagNameOffset, Members, SubGroups, Docs) \ + GroupName, #include "clang/Basic/DiagnosticGroups.inc" #undef CATEGORY #undef DIAG_ENTRY - NUM_GROUPS }; } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 292e4af1b3b30..a6b17ccb6799d 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -109,8 +109,6 @@ def err_fe_expected_clang_command : Error< "expected a clang compiler command">; def err_fe_remap_missing_to_file : Error< "could not remap file '%0' to the contents of file '%1'">, DefaultFatal; -def err_fe_remap_missing_from_file : Error< - "could not remap from missing file '%0'">, DefaultFatal; def err_fe_unable_to_load_pch : Error< "unable to load PCH file">; def err_fe_unable_to_load_plugin : Error< diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 1fa38ed6066e2..a051af327de28 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -14,7 +14,6 @@ #ifndef LLVM_CLANG_BASIC_DIAGNOSTICIDS_H #define LLVM_CLANG_BASIC_DIAGNOSTICIDS_H -#include "clang/Basic/DiagnosticCategories.h" #include "clang/Basic/LLVM.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringRef.h" @@ -85,7 +84,7 @@ namespace clang { /// to either Ignore (nothing), Remark (emit a remark), Warning /// (emit a 
warning) or Error (emit as an error). It allows clients to
/// map ERRORs to Error or Fatal (stop emitting diagnostics after this one).
-  enum class Severity : uint8_t {
+  enum class Severity {
     // NOTE: 0 means "uncomputed".
     Ignored = 1, ///< Do not present this diagnostic, ignore it.
     Remark = 2,  ///< Present this diagnostic as a remark.
@@ -182,96 +181,13 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
 public:
   /// The level of the diagnostic, after it has been through mapping.
-  enum Level : uint8_t { Ignored, Note, Remark, Warning, Error, Fatal };
-
-  // Diagnostic classes.
-  enum Class {
-    CLASS_INVALID = 0x00,
-    CLASS_NOTE = 0x01,
-    CLASS_REMARK = 0x02,
-    CLASS_WARNING = 0x03,
-    CLASS_EXTENSION = 0x04,
-    CLASS_ERROR = 0x05
-  };
-
-  static bool IsCustomDiag(diag::kind Diag) {
-    return Diag >= diag::DIAG_UPPER_LIMIT;
-  }
-
-  class CustomDiagDesc {
-    LLVM_PREFERRED_TYPE(diag::Severity)
-    unsigned DefaultSeverity : 3;
-    LLVM_PREFERRED_TYPE(Class)
-    unsigned DiagClass : 3;
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned ShowInSystemHeader : 1;
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned ShowInSystemMacro : 1;
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned HasGroup : 1;
-    diag::Group Group;
-    std::string Description;
-
-    auto get_as_tuple() const {
-      return std::tuple(DefaultSeverity, DiagClass, ShowInSystemHeader,
-                        ShowInSystemMacro, HasGroup, Group,
-                        std::string_view{Description});
-    }
-
-  public:
-    CustomDiagDesc(diag::Severity DefaultSeverity, std::string Description,
-                   unsigned Class = CLASS_WARNING,
-                   bool ShowInSystemHeader = false,
-                   bool ShowInSystemMacro = false,
-                   std::optional<diag::Group> Group = std::nullopt)
-        : DefaultSeverity(static_cast<unsigned>(DefaultSeverity)),
-          DiagClass(Class), ShowInSystemHeader(ShowInSystemHeader),
-          ShowInSystemMacro(ShowInSystemMacro), HasGroup(Group != std::nullopt),
-          Group(Group.value_or(diag::Group{})),
-          Description(std::move(Description)) {}
-
-    std::optional<diag::Group> GetGroup() const {
-      if (HasGroup)
-        return Group;
-      return std::nullopt;
-    }
-
-    diag::Severity GetDefaultSeverity() const {
-      return static_cast<diag::Severity>(DefaultSeverity);
-    }
-
-    Class GetClass() const { return static_cast<Class>(DiagClass); }
-    std::string_view GetDescription() const { return Description; }
-    bool ShouldShowInSystemHeader() const { return ShowInSystemHeader; }
-
-    friend bool operator==(const CustomDiagDesc &lhs,
-                           const CustomDiagDesc &rhs) {
-      return lhs.get_as_tuple() == rhs.get_as_tuple();
-    }
-
-    friend bool operator<(const CustomDiagDesc &lhs,
-                          const CustomDiagDesc &rhs) {
-      return lhs.get_as_tuple() < rhs.get_as_tuple();
-    }
-  };
-
-  struct GroupInfo {
-    LLVM_PREFERRED_TYPE(diag::Severity)
-    unsigned Severity : 3;
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned HasNoWarningAsError : 1;
+  enum Level {
+    Ignored, Note, Remark, Warning, Error, Fatal
   };

 private:
   /// Information for uniquing and looking up custom diags.
   std::unique_ptr<diag::CustomDiagInfo> CustomDiagInfo;
-  std::unique_ptr<GroupInfo[]> GroupInfos = []() {
-    auto GIs = std::make_unique<GroupInfo[]>(
-        static_cast<size_t>(diag::Group::NUM_GROUPS));
-    for (size_t i = 0; i != static_cast<size_t>(diag::Group::NUM_GROUPS); ++i)
-      GIs[i] = {{}, false};
-    return GIs;
-  }();

 public:
   DiagnosticIDs();
@@ -286,35 +202,7 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
   // FIXME: Replace this function with a create-only facility like
   // createCustomDiagIDFromFormatString() to enforce safe usage. At the time of
   // writing, nearly all callers of this function were invalid.
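// Editor's note: a hedged usage sketch (not part of the patch) of the
// Level/FormatString overload restored below; callers typically reach it
// through DiagnosticsEngine::getCustomDiagID. Diags, Loc, and SomeName are
// placeholders for illustration:
//
//   unsigned ID = Diags.getCustomDiagID(DiagnosticsEngine::Warning,
//                                       "unexpected value %0 here");
//   Diags.Report(Loc, ID) << SomeName;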
-  unsigned getCustomDiagID(CustomDiagDesc Diag);
-
-  // TODO: Deprecate this once all uses are removed from LLVM
-  // [[deprecated("Use a CustomDiagDesc instead of a Level")]]
-  unsigned getCustomDiagID(Level Level, StringRef Message) {
-    return getCustomDiagID([&]() -> CustomDiagDesc {
-      switch (Level) {
-      case DiagnosticIDs::Level::Ignored:
-        return {diag::Severity::Ignored, std::string(Message), CLASS_WARNING,
-                /*ShowInSystemHeader*/ true};
-      case DiagnosticIDs::Level::Note:
-        return {diag::Severity::Fatal, std::string(Message), CLASS_NOTE,
-                /*ShowInSystemHeader*/ true};
-      case DiagnosticIDs::Level::Remark:
-        return {diag::Severity::Remark, std::string(Message), CLASS_REMARK,
-                /*ShowInSystemHeader*/ true};
-      case DiagnosticIDs::Level::Warning:
-        return {diag::Severity::Warning, std::string(Message), CLASS_WARNING,
-                /*ShowInSystemHeader*/ true};
-      case DiagnosticIDs::Level::Error:
-        return {diag::Severity::Error, std::string(Message), CLASS_ERROR,
-                /*ShowInSystemHeader*/ true};
-      case DiagnosticIDs::Level::Fatal:
-        return {diag::Severity::Fatal, std::string(Message), CLASS_ERROR,
-                /*ShowInSystemHeader*/ true};
-      }
-      llvm_unreachable("Fully covered switch above!");
-    }());
-  }
+  unsigned getCustomDiagID(Level L, StringRef FormatString);

   //===--------------------------------------------------------------------===//
   // Diagnostic classification and reporting interfaces.
@@ -326,36 +214,35 @@
   /// Return true if the unmapped diagnostic level of the specified
   /// diagnostic ID is a Warning or Extension.
   ///
-  /// This is not legal to call on NOTEs.
-  bool isWarningOrExtension(unsigned DiagID) const;
+  /// This only works on builtin diagnostics, not custom ones, and is not
+  /// legal to call on NOTEs.
+  static bool isBuiltinWarningOrExtension(unsigned DiagID);

   /// Return true if the specified diagnostic is mapped to errors by
   /// default.
-  bool isDefaultMappingAsError(unsigned DiagID) const;
+  static bool isDefaultMappingAsError(unsigned DiagID);

   /// Get the default mapping for this diagnostic.
-  DiagnosticMapping getDefaultMapping(unsigned DiagID) const;
-
-  void initCustomDiagMapping(DiagnosticMapping &, unsigned DiagID);
+  static DiagnosticMapping getDefaultMapping(unsigned DiagID);

-  /// Determine whether the given diagnostic ID is a Note.
-  bool isNote(unsigned DiagID) const;
+  /// Determine whether the given built-in diagnostic ID is a Note.
+  static bool isBuiltinNote(unsigned DiagID);

-  /// Determine whether the given diagnostic ID is for an
+  /// Determine whether the given built-in diagnostic ID is for an
   /// extension of some sort.
-  bool isExtensionDiag(unsigned DiagID) const {
+  static bool isBuiltinExtensionDiag(unsigned DiagID) {
     bool ignored;
-    return isExtensionDiag(DiagID, ignored);
+    return isBuiltinExtensionDiag(DiagID, ignored);
   }

-  /// Determine whether the given diagnostic ID is for an
+  /// Determine whether the given built-in diagnostic ID is for an
   /// extension of some sort, and whether it is enabled by default.
   ///
   /// This also returns EnabledByDefault, which is set to indicate whether the
   /// diagnostic is ignored by default (in which case -pedantic enables it) or
   /// treated as a warning/error by default.
   ///
-  bool isExtensionDiag(unsigned DiagID, bool &EnabledByDefault) const;
+  static bool isBuiltinExtensionDiag(unsigned DiagID, bool &EnabledByDefault);

   /// Given a group ID, returns the flag that toggles the group.
/// For example, for Group::DeprecatedDeclarations, returns @@ -365,22 +252,19 @@ class DiagnosticIDs : public RefCountedBase { /// Given a diagnostic group ID, return its documentation. static StringRef getWarningOptionDocumentation(diag::Group GroupID); - void setGroupSeverity(StringRef Group, diag::Severity); - void setGroupNoWarningsAsError(StringRef Group, bool); - /// Given a group ID, returns the flag that toggles the group. /// For example, for "deprecated-declarations", returns /// Group::DeprecatedDeclarations. static std::optional getGroupForWarningOption(StringRef); /// Return the lowest-level group that contains the specified diagnostic. - std::optional getGroupForDiag(unsigned DiagID) const; + static std::optional getGroupForDiag(unsigned DiagID); /// Return the lowest-level warning option that enables the specified /// diagnostic. /// /// If there is no -Wfoo flag that controls the diagnostic, this returns null. - StringRef getWarningOptionForDiag(unsigned DiagID); + static StringRef getWarningOptionForDiag(unsigned DiagID); /// Return the category number that a specified \p DiagID belongs to, /// or 0 if no category. @@ -481,8 +365,6 @@ class DiagnosticIDs : public RefCountedBase { getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, const DiagnosticsEngine &Diag) const LLVM_READONLY; - Class getDiagClass(unsigned DiagID) const; - /// Used to report a diagnostic that is finally fully formed. /// /// \returns \c true if the diagnostic was emitted, \c false if it was diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e4e04bff8b512..64e6d0407b0ce 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2933,15 +2933,9 @@ def ext_constexpr_function_never_constant_expr : ExtWarn< "constant expression">, InGroup>, DefaultError; def err_attr_cond_never_constant_expr : Error< "%0 attribute expression never produces a constant expression">; -def err_diagnose_if_unknown_warning : Error<"unknown warning group '%0'">; def err_diagnose_if_invalid_diagnostic_type : Error< "invalid diagnostic type for 'diagnose_if'; use \"error\" or \"warning\" " "instead">; -def err_diagnose_if_unknown_option : Error<"unknown diagnostic option">; -def err_diagnose_if_expected_equals : Error< - "expected '=' after diagnostic option">; -def err_diagnose_if_unexpected_value : Error< - "unexpected value; use 'true' or 'false'">; def err_constexpr_body_no_return : Error< "no return statement in %select{constexpr|consteval}0 function">; def err_constexpr_return_missing_expr : Error< @@ -3056,8 +3050,6 @@ def note_is_deducible_constraint_evaluated_to_false : Note< "cannot deduce template arguments for %0 from %1">; def err_constrained_virtual_method : Error< "virtual function cannot have a requires clause">; -def err_trailing_requires_clause_on_deduction_guide : Error< - "deduction guide cannot have a requires clause">; def err_constrained_non_templated_function : Error<"non-templated function cannot have a requires clause">; def err_non_temp_spec_requires_clause : Error< @@ -12394,12 +12386,16 @@ def err_hlsl_packoffset_alignment_mismatch : Error<"packoffset at 'y' not match def err_hlsl_pointers_unsupported : Error< "%select{pointers|references}0 are unsupported in HLSL">; def err_hlsl_missing_resource_class : Error<"HLSL resource needs to have [[hlsl::resource_class()]] attribute">; +def err_hlsl_attribute_needs_intangible_type: Error<"attribute %0 can be used only on HLSL 
intangible type %1">; def err_hlsl_operator_unsupported : Error< "the '%select{&|*|->}0' operator is unsupported in HLSL">; def err_hlsl_param_qualifier_mismatch : Error<"conflicting parameter qualifier %0 on parameter %1">; +def err_hlsl_vector_compound_assignment_truncation : Error< + "left hand operand of type %0 to compound assignment cannot be truncated " + "when used with right hand operand of type %1">; def warn_hlsl_impcast_vector_truncation : Warning< "implicit conversion truncates vector: %0 to %1">, InGroup; @@ -12610,6 +12606,9 @@ def note_acc_construct_here : Note<"'%0' construct is here">; def err_acc_loop_spec_conflict : Error<"OpenACC clause '%0' on '%1' construct conflicts with previous " "data dependence clause">; +def err_acc_collapse_loop_count + : Error<"OpenACC 'collapse' clause loop count must be a %select{constant " + "expression|positive integer value, evaluated to %1}0">; // AMDGCN builtins diagnostics def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h index 74029a91d1a6d..ce4e8c1fbe16e 100644 --- a/clang/include/clang/Basic/FileManager.h +++ b/clang/include/clang/Basic/FileManager.h @@ -84,7 +84,7 @@ class FileManager : public RefCountedBase { /// VirtualDirectoryEntries/VirtualFileEntries above. /// llvm::StringMap, llvm::BumpPtrAllocator> - SeenDirEntries; + SeenDirEntries; /// A cache that maps paths to file entries (either real or /// virtual) we have looked up, or an error that occurred when we looked up @@ -190,6 +190,8 @@ class FileManager : public RefCountedBase { /// /// \param CacheFailure If true and the file does not exist, we'll cache /// the failure to find this file. + LLVM_DEPRECATED("Functions returning DirectoryEntry are deprecated.", + "getOptionalDirectoryRef()") llvm::ErrorOr getDirectory(StringRef DirName, bool CacheFailure = true); @@ -207,6 +209,8 @@ class FileManager : public RefCountedBase { /// /// \param CacheFailure If true and the file does not exist, we'll cache /// the failure to find this file. 
+ LLVM_DEPRECATED("Functions returning FileEntry are deprecated.", + "getOptionalFileRef()") llvm::ErrorOr getFile(StringRef Filename, bool OpenFile = false, bool CacheFailure = true); @@ -269,6 +273,8 @@ class FileManager : public RefCountedBase { FileEntryRef getVirtualFileRef(StringRef Filename, off_t Size, time_t ModificationTime); + LLVM_DEPRECATED("Functions returning FileEntry are deprecated.", + "getVirtualFileRef()") const FileEntry *getVirtualFile(StringRef Filename, off_t Size, time_t ModificationTime); diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index fd3346d29f26a..68db400c22e6c 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -364,6 +364,8 @@ BENIGN_LANGOPT(CompatibilityQualifiedIdBlockParamTypeChecking, 1, 0, LANGOPT(ObjCDisableDirectMethodsForTesting, 1, 0, "Disable recognition of objc_direct methods") LANGOPT(CFProtectionBranch , 1, 0, "Control-Flow Branch Protection enabled") +ENUM_LANGOPT(CFBranchLabelScheme, CFBranchLabelSchemeKind, 2, CFBranchLabelSchemeKind::Default, + "Control-Flow Branch Protection Label Scheme") LANGOPT(FakeAddressSpaceMap , 1, 0, "OpenCL fake address space map") ENUM_LANGOPT(AddressSpaceMapMangling , AddrSpaceMapMangling, 2, ASMM_Target, "OpenCL address space map mangling mode") LANGOPT(IncludeDefaultHeader, 1, 0, "Include default header file for OpenCL") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 6c186c410e158..8c605f6852016 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_BASIC_LANGOPTIONS_H #define LLVM_CLANG_BASIC_LANGOPTIONS_H +#include "clang/Basic/CFProtectionOptions.h" #include "clang/Basic/CommentOptions.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/LangStandard.h" @@ -73,6 +74,7 @@ class LangOptionsBase { public: using Visibility = clang::Visibility; using RoundingMode = llvm::RoundingMode; + using CFBranchLabelSchemeKind = clang::CFBranchLabelSchemeKind; enum GCMode { NonGC, GCOnly, HybridGC }; enum StackProtectorMode { SSPOff, SSPOn, SSPStrong, SSPReq }; @@ -237,6 +239,12 @@ class LangOptionsBase { /// in the initializers of members of local classes. Ver18, + /// Attempt to be ABI-compatible with code generated by Clang 19.0.x. + /// This causes clang to: + /// - Incorrectly mangles the 'base type' substitutions of the CXX + /// construction vtable because it hasn't added 'type' as a substitution. + Ver19, + /// Conform to the underlying platform's C and C++ ABIs as closely /// as we can. 
Latest diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def index 85f4859925f0b..19cdfe7672133 100644 --- a/clang/include/clang/Basic/OpenACCClauses.def +++ b/clang/include/clang/Basic/OpenACCClauses.def @@ -24,6 +24,7 @@ VISIT_CLAUSE(Auto) VISIT_CLAUSE(Async) VISIT_CLAUSE(Attach) +VISIT_CLAUSE(Collapse) VISIT_CLAUSE(Copy) CLAUSE_ALIAS(PCopy, Copy, true) CLAUSE_ALIAS(PresentOrCopy, Copy, true) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index f31d88a354ea2..5778385060629 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -16,6 +16,7 @@ #include "clang/Basic/AddressSpaces.h" #include "clang/Basic/BitmaskEnum.h" +#include "clang/Basic/CFProtectionOptions.h" #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/LangOptions.h" @@ -1727,6 +1728,13 @@ class TargetInfo : public TransferrableTargetInfo, virtual bool checkCFProtectionBranchSupported(DiagnosticsEngine &Diags) const; + /// Get the target default CFBranchLabelScheme scheme + virtual CFBranchLabelSchemeKind getDefaultCFBranchLabelScheme() const; + + virtual bool + checkCFBranchLabelSchemeSupported(const CFBranchLabelSchemeKind Scheme, + DiagnosticsEngine &Diags) const; + /// Check if the target supports CFProtection return. virtual bool checkCFProtectionReturnSupported(DiagnosticsEngine &Diags) const; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 92f39744f3d08..8652b5e3a9c90 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2126,3 +2126,9 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in { def FAMIN : WInst<"vamin", "...", "fhQdQfQh">; def FAMAX : WInst<"vamax", "...", "fhQdQfQh">; } + +let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { + // fscale + def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">; + def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">; +} \ No newline at end of file diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ae6b55e98827f..45673cb915c5e 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -817,4 +817,11 @@ multiclass ZAReadzArray{ defm SVREADZ_VG2 : ZAReadzArray<"2">; defm SVREADZ_VG4 : ZAReadzArray<"4">; + +let SMETargetGuard = "sme2,sme-lutv2" in { + def SVWRITE_LANE_ZT : SInst<"svwrite_lane_zt[_{d}]", "vidi", "cUcsUsiUilUlfhdb", MergeNone, "aarch64_sme_write_lane_zt", [IsStreaming, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck1_3>]>; + def SVWRITE_ZT : SInst<"svwrite_zt[_{d}]", "vid", "cUcsUsiUilUlfhdb", MergeNone, "aarch64_sme_write_zt", [IsStreaming, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>; + def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; +} + } // let SVETargetGuard = InvalidMode diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index edf73d9022b06..da496e30fbb52 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2418,6 +2418,16 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; } +// +// Multi-vector scaling +// +let SVETargetGuard = InvalidMode, 
SMETargetGuard = "sme2,fp8" in { + def FSCALE_SINGLE_X2 : Inst<"svscale[_single_{d}_x2]", "22x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x2", [IsStreaming],[]>; + def FSCALE_SINGLE_X4 : Inst<"svscale[_single_{d}_x4]", "44x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x4", [IsStreaming],[]>; + def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>; + def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; +} + let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { // == BFloat16 multiply-subtract == def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, VerifyRuntimeMode], []>; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 23bd686a85f52..9d183ff2d69b3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -816,6 +816,7 @@ def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>, def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group; def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"">, + Visibility<[ClangOption, FlangOption]>, HelpText<"Search $prefix$file for executables, libraries, and data files. " "If $prefix is a directory, search $prefix/$file">; def gcc_install_dir_EQ : Joined<["--"], "gcc-install-dir=">, @@ -1055,6 +1056,7 @@ def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>, HelpText<"Pass to the linker">, MetaVarName<"">, Group; def Xoffload_linker : JoinedAndSeparate<["-"], "Xoffload-linker">, + Visibility<[ClangOption, CLOption, FlangOption, DXCOption]>, HelpText<"Pass to the offload linkers or the ones identified by -">, MetaVarName<" ">, Group; def Xpreprocessor : Separate<["-"], "Xpreprocessor">, Group, @@ -1975,10 +1977,15 @@ def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">, Group Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, HelpText<"Disable colors in diagnostics">; def : Flag<["-"], "fdiagnostics-color">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>, Alias; + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, + Alias; def : Flag<["-"], "fno-diagnostics-color">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>, Alias; -def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group; + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, + Alias; +def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group, + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, + Values<"auto,always,never">, + HelpText<"When to use colors in diagnostics">; def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group, Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>, HelpText<"Use ANSI escape codes for diagnostics">, @@ -2841,6 +2848,10 @@ def fcf_protection : Flag<["-"], "fcf-protection">, Group, Visibility<[ClangOption, CLOption, CC1Option]>, Alias, AliasArgs<["full"]>, HelpText<"Enable cf-protection in 'full' mode">; +def mcf_branch_label_scheme_EQ : Joined<["-"], "mcf-branch-label-scheme=">, + Visibility<[ClangOption, CC1Option]>, Group, + HelpText<"Select label scheme for branch control-flow architecture protection">, + Values<"unlabeled,func-sig">; def mfunction_return_EQ : Joined<["-"], "mfunction-return=">, Group, Visibility<[ClangOption, CLOption, CC1Option]>, HelpText<"Replace returns with jumps to ``__x86_return_thunk`` (x86 only, error otherwise)">, @@ 
-2969,7 +2980,7 @@ def flax_vector_conversions_EQ : Joined<["-"], "flax-vector-conversions=">, Grou "LangOptions::LaxVectorConversionKind::Integer", "LangOptions::LaxVectorConversionKind::All"]>, MarshallingInfoEnum, - open_cl.KeyPath # + !strconcat("(", open_cl.KeyPath, " || ", hlsl.KeyPath, ")") # " ? LangOptions::LaxVectorConversionKind::None" # " : LangOptions::LaxVectorConversionKind::All">; def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group, @@ -4244,8 +4255,8 @@ defm basic_block_address_map : BoolFOption<"basic-block-address-map", def fbasic_block_sections_EQ : Joined<["-"], "fbasic-block-sections=">, Group, Visibility<[ClangOption, CC1Option, CC1AsOption]>, HelpText<"Place each function's basic blocks in unique sections (ELF Only)">, - DocBrief<[{Generate labels for each basic block or place each basic block or a subset of basic blocks in its own section.}]>, - Values<"all,labels,none,list=">, + DocBrief<[{Place each basic block or a subset of basic blocks in its own section.}]>, + Values<"all,none,list=">, MarshallingInfoString, [{"none"}]>; defm data_sections : BoolFOption<"data-sections", CodeGenOpts<"DataSections">, DefaultFalse, @@ -6888,6 +6899,8 @@ def module_suffix : Separate<["-"], "module-suffix">, Group, MetaVarNa HelpText<"Use as the suffix for module files (the default value is `.mod`)">; def fno_reformat : Flag<["-"], "fno-reformat">, Group, HelpText<"Dump the cooked character stream in -E mode">; +def fpreprocess_include_lines : Flag<["-"], "fpreprocess-include-lines">, Group, + HelpText<"Treat INCLUDE lines like #include directives in -E mode">; defm analyzed_objects_for_unparse : OptOutFC1FFlag<"analyzed-objects-for-unparse", "", "Do not use the analyzed objects when unparsing">; def emit_fir : Flag<["-"], "emit-fir">, Group, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e1c3a99cfa167..d616c3834c429 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7423,7 +7423,8 @@ class Sema final : public SemaBase { SourceLocation Loc, BinaryOperatorKind Opc); QualType CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS, - SourceLocation Loc); + SourceLocation Loc, + BinaryOperatorKind Opc); /// Context in which we're performing a usual arithmetic conversion. 
enum ArithConvKind { @@ -11252,6 +11253,7 @@ class Sema final : public SemaBase { ConceptDecl *NamedConcept, NamedDecl *FoundDecl, const TemplateArgumentListInfo *TemplateArgs, TemplateTypeParmDecl *ConstrainedParameter, + QualType ConstrainedType, SourceLocation EllipsisLoc); bool AttachTypeConstraint(AutoTypeLoc TL, diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index e088254c566d3..fa957abc9791a 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -63,6 +63,11 @@ class SemaHLSL : public SemaBase { std::initializer_list AllowedStages); void DiagnoseAvailabilityViolations(TranslationUnitDecl *TU); + QualType handleVectorBinOpConversion(ExprResult &LHS, ExprResult &RHS, + QualType LHSType, QualType RHSType, + bool IsCompAssign); + void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc); + void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL); void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL); void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); @@ -70,7 +75,7 @@ class SemaHLSL : public SemaBase { void handleShaderAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); - bool handleResourceTypeAttr(const ParsedAttr &AL); + bool handleResourceTypeAttr(QualType T, const ParsedAttr &AL); bool CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); QualType ProcessResourceTypeAttributes(QualType Wrapped); diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 0ca76842e5f90..839fdb79cd0ac 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -87,9 +87,14 @@ class SemaOpenACC : public SemaBase { SmallVector VarList; }; + struct CollapseDetails { + bool IsForce; + Expr *LoopCount; + }; + std::variant + ReductionDetails, CollapseDetails> Details = std::monostate{}; public: @@ -246,6 +251,18 @@ class SemaOpenACC : public SemaBase { return std::get(Details).IsZero; } + bool isForce() const { + assert(ClauseKind == OpenACCClauseKind::Collapse && + "Only 'collapse' has a force tag"); + return std::get(Details).IsForce; + } + + Expr *getLoopCount() const { + assert(ClauseKind == OpenACCClauseKind::Collapse && + "Only 'collapse' has a loop count"); + return std::get(Details).LoopCount; + } + ArrayRef getDeviceTypeArchitectures() const { assert((ClauseKind == OpenACCClauseKind::DeviceType || ClauseKind == OpenACCClauseKind::DType) && @@ -384,6 +401,12 @@ class SemaOpenACC : public SemaBase { "Only 'device_type'/'dtype' has a device-type-arg list"); Details = DeviceTypeDetails{std::move(Archs)}; } + + void setCollapseDetails(bool IsForce, Expr *LoopCount) { + assert(ClauseKind == OpenACCClauseKind::Collapse && + "Only 'collapse' has collapse details"); + Details = CollapseDetails{IsForce, LoopCount}; + } }; SemaOpenACC(Sema &S); @@ -448,6 +471,8 @@ class SemaOpenACC : public SemaBase { Expr *LowerBound, SourceLocation ColonLocFirst, Expr *Length, SourceLocation RBLoc); + /// Checks the loop depth value for a collapse clause. 
+ ExprResult CheckCollapseLoopCount(Expr *LoopCount); /// Helper type for the registration/assignment of constructs that need to /// 'know' about their parent constructs and hold a reference to them, such as diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 0340c23fd170d..fe27290efdbfc 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -411,6 +411,11 @@ enum class TemplateSubstitutionKind : char { /// lookup will search our outer scope. bool CombineWithOuterScope; + /// Whether this scope is being used to instantiate a lambda expression, + /// in which case it should be reused for instantiating the lambda's + /// FunctionProtoType. + bool InstantiatingLambda = false; + /// If non-NULL, the template parameter pack that has been /// partially substituted per C++0x [temp.arg.explicit]p9. NamedDecl *PartiallySubstitutedPack = nullptr; @@ -425,9 +430,11 @@ enum class TemplateSubstitutionKind : char { unsigned NumArgsInPartiallySubstitutedPack; public: - LocalInstantiationScope(Sema &SemaRef, bool CombineWithOuterScope = false) + LocalInstantiationScope(Sema &SemaRef, bool CombineWithOuterScope = false, + bool InstantiatingLambda = false) : SemaRef(SemaRef), Outer(SemaRef.CurrentInstantiationScope), - CombineWithOuterScope(CombineWithOuterScope) { + CombineWithOuterScope(CombineWithOuterScope), + InstantiatingLambda(InstantiatingLambda) { SemaRef.CurrentInstantiationScope = this; } @@ -553,6 +560,9 @@ enum class TemplateSubstitutionKind : char { /// Determine whether D is a pack expansion created in this scope. bool isLocalPackExpansion(const Decl *D); + + /// Determine whether this scope is for instantiating a lambda. + bool isLambda() const { return InstantiatingLambda; } }; class TemplateDeclInstantiator diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 5be33ae0ed1b9..bde19a09d6ae0 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1128,7 +1128,7 @@ enum PredefinedTypeIDs { #define WASM_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID, #include "clang/Basic/WebAssemblyReferenceTypes.def" // \brief AMDGPU types with auto numeration -#define AMDGPU_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID, +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) PREDEF_TYPE_##Id##_ID, #include "clang/Basic/AMDGPUTypes.def" // \brief HLSL intangible types with auto numeration #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID, diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 760866fd9de93..e21d41c867314 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -233,13 +233,13 @@ class ASTWriter : public ASTDeserializationListener, /// instead of comparing the result of `getDeclID()` or `GetDeclRef()`. llvm::SmallPtrSet PredefinedDecls; - /// Mapping from FunctionDecl to the list of lambda IDs inside the function. + /// Mapping from FunctionDecl ID to the list of lambda IDs inside the + /// function. /// /// These lambdas have to be loaded right after the function they belong to. /// In order to have canonical declaration for lambda class from the same /// module as enclosing function during deserialization. 
-  llvm::DenseMap<const FunctionDecl *, SmallVector<LocalDeclID>>
-      FunctionToLambdasMap;
+  llvm::DenseMap<LocalDeclID, SmallVector<LocalDeclID>> FunctionToLambdasMap;

   /// Offset of each declaration in the bitstream, indexed by
   /// the declaration's ID.
diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index 3e920c0f68360..30e7f6b3e57bd 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -88,13 +88,13 @@ class InputFile {

   InputFile(FileEntryRef File, bool isOverridden = false,
             bool isOutOfDate = false) {
-    assert(!(isOverridden && isOutOfDate) &&
-           "an overridden cannot be out-of-date");
     unsigned intVal = 0;
-    if (isOverridden)
-      intVal = Overridden;
-    else if (isOutOfDate)
+    // Give isOutOfDate higher priority than isOverridden. This can happen
+    // when the recorded hash value mismatches.
+    if (isOutOfDate)
       intVal = OutOfDate;
+    else if (isOverridden)
+      intVal = Overridden;
     Val.setPointerAndInt(&File.getMapEntry(), intVal);
   }
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 585246547b3dc..349040c15eeb8 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -74,7 +74,6 @@ def Performance : Package<"performance">, ParentPackage<OptIn>;
 def Security : Package<"security">;
 def InsecureAPI : Package<"insecureAPI">, ParentPackage<Security>;
 def SecurityAlpha : Package<"security">, ParentPackage<Alpha>;
-def Taint : Package<"taint">, ParentPackage<SecurityAlpha>;
 def CERT : Package<"cert">, ParentPackage<SecurityAlpha>;
 def ENV : Package<"env">, ParentPackage<CERT>;
@@ -291,11 +290,6 @@ def PointerArithChecker : Checker<"PointerArithm">,
                 "elements">,
   Documentation<HasDocumentation>;

-def PointerSubChecker : Checker<"PointerSub">,
-  HelpText<"Check for pointer subtractions on two pointers pointing to "
-           "different memory chunks">,
-  Documentation<HasDocumentation>;
-
 def TestAfterDivZeroChecker : Checker<"TestAfterDivZero">,
   HelpText<"Check for division by variable that is later compared against 0. "
            "Either the comparison is useless or there is division by zero.">,
@@ -1004,6 +998,11 @@ def MmapWriteExecChecker : Checker<"MmapWriteExec">,
   HelpText<"Warn on mmap() calls with both writable and executable access">,
   Documentation<HasDocumentation>;

+def PointerSubChecker : Checker<"PointerSub">,
+  HelpText<"Check for pointer subtractions on two pointers pointing to "
+           "different memory chunks">,
+  Documentation<HasDocumentation>;
+
 def PutenvStackArray : Checker<"PutenvStackArray">,
   HelpText<"Finds calls to the function 'putenv' which pass a pointer to "
           "an automatic (stack-allocated) array as the argument.">,
@@ -1049,30 +1048,6 @@ def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,

 } // end "alpha.security"

-//===----------------------------------------------------------------------===//
-// Taint checkers.
-//===----------------------------------------------------------------------===//
-
-let ParentPackage = Taint in {
-
-def TaintPropagationChecker : Checker<"TaintPropagation">, // Modelling checker
-  HelpText<"Generate taint information used by other checkers">,
-  CheckerOptions<[
-    CmdLineOption,
-  ]>,
-  Documentation<HasDocumentation>,
-  Hidden;
-
-def GenericTaintChecker : Checker<"GenericTaint">,
-  HelpText<"Reports potential injection vulnerabilities">,
-  Dependencies<[TaintPropagationChecker]>,
-  Documentation<HasDocumentation>;
-
-} // end "alpha.security.taint"

 //===----------------------------------------------------------------------===//
 // Mac OS X, Cocoa, and Core Foundation checkers.
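// Editor's note: a hedged sketch (not part of the patch). With the taint
// checkers relocated from alpha.security.taint to optin.taint (next hunk),
// they would be enabled on the analyzer command line roughly as below; the
// file name test.c is a placeholder:
//
//   clang --analyze -Xclang -analyzer-checker=optin.taint.GenericTaint test.c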
@@ -1704,12 +1679,36 @@ def UnixAPIPortabilityChecker : Checker<"UnixAPI">, let ParentPackage = TaintOptIn in { +def TaintPropagationChecker : Checker<"TaintPropagation">, // Modelling checker + HelpText<"Generate taint information used by other checkers">, + CheckerOptions<[ + CmdLineOption + ]>, + Documentation, + Hidden; + +def GenericTaintChecker : Checker<"GenericTaint">, + HelpText<"Reports potential injection vulnerabilities">, + Dependencies<[TaintPropagationChecker]>, + Documentation; + + def TaintedAllocChecker: Checker<"TaintedAlloc">, HelpText<"Check for memory allocations, where the size parameter " "might be a tainted (attacker controlled) value.">, Dependencies<[DynamicMemoryModeling, TaintPropagationChecker]>, Documentation; +def TaintedDivChecker: Checker<"TaintedDiv">, + HelpText<"Check for divisions where the denominator is tainted " + "(attacker controlled) and might be 0.">, + Dependencies<[TaintPropagationChecker]>, + Documentation; + } // end "optin.taint" //===----------------------------------------------------------------------===// @@ -1771,6 +1770,10 @@ def UncountedLambdaCapturesChecker : Checker<"UncountedLambdaCapturesChecker">, let ParentPackage = WebKitAlpha in { +def NoUncheckedPtrMemberChecker : Checker<"NoUncheckedPtrMemberChecker">, + HelpText<"Check for no unchecked member variables.">, + Documentation; + def UncountedCallArgsChecker : Checker<"UncountedCallArgsChecker">, HelpText<"Check uncounted call arguments.">, Documentation; diff --git a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h index 24c5b66fd5822..de40b96614dbc 100644 --- a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h @@ -221,6 +221,10 @@ class CheckerManager { return static_cast(CheckerTags[tag]); } + template bool isRegisteredChecker() { + return CheckerTags.contains(getTag()); + } + //===----------------------------------------------------------------------===// // Functions for running checkers for AST traversing. //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Testing/CommandLineArgs.h b/clang/include/clang/Testing/CommandLineArgs.h index e71907e8bbd0c..52beac7254fef 100644 --- a/clang/include/clang/Testing/CommandLineArgs.h +++ b/clang/include/clang/Testing/CommandLineArgs.h @@ -21,19 +21,18 @@ namespace clang { enum TestLanguage { - Lang_C89, - Lang_C99, - Lang_CXX03, - Lang_CXX11, - Lang_CXX14, - Lang_CXX17, - Lang_CXX20, - Lang_CXX23, +#define TESTLANGUAGE(lang, version, std_flag, version_index) \ + Lang_##lang##version, +#include "clang/Testing/TestLanguage.def" + Lang_OpenCL, Lang_OBJC, - Lang_OBJCXX + Lang_OBJCXX, }; +std::vector getCOrLater(int MinimumStd); +std::vector getCXXOrLater(int MinimumStd); + std::vector getCommandLineArgsForTesting(TestLanguage Lang); std::vector getCC1ArgsForTesting(TestLanguage Lang); diff --git a/clang/include/clang/Testing/TestClangConfig.h b/clang/include/clang/Testing/TestClangConfig.h index 1b4efca80e9d4..e52aa37482dc1 100644 --- a/clang/include/clang/Testing/TestClangConfig.h +++ b/clang/include/clang/Testing/TestClangConfig.h @@ -27,37 +27,90 @@ struct TestClangConfig { /// The argument of the `-target` command line flag. 
std::string Target; - bool isC() const { return Language == Lang_C89 || Language == Lang_C99; } + bool isC() const { + return false +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + || Language == Lang_##lang##version +#include "clang/Testing/TestLanguage.def" + ; + } - bool isC99OrLater() const { return Language == Lang_C99; } + bool isC(int Version) const { + return false +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + || (Version == version && Language == Lang_##lang##version) +#include "clang/Testing/TestLanguage.def" + ; + } - bool isCXX() const { - return Language == Lang_CXX03 || Language == Lang_CXX11 || - Language == Lang_CXX14 || Language == Lang_CXX17 || - Language == Lang_CXX20 || Language == Lang_CXX23; + bool isCOrLater(int MinimumStdVersion) const { + const auto MinimumStdVersionIndex = 0 +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + +(MinimumStdVersion == version ? version_index : 0) +#include "clang/Testing/TestLanguage.def" + ; + switch (Language) { +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return MinimumStdVersionIndex <= version_index; +#include "clang/Testing/TestLanguage.def" + default: + return false; + } } - bool isCXX11OrLater() const { - return Language == Lang_CXX11 || Language == Lang_CXX14 || - Language == Lang_CXX17 || Language == Lang_CXX20 || - Language == Lang_CXX23; + bool isC99OrLater() const { return isCOrLater(99); } + + bool isCOrEarlier(int MaximumStdVersion) const { + return isC() && (isC(MaximumStdVersion) || !isCOrLater(MaximumStdVersion)); } - bool isCXX14OrLater() const { - return Language == Lang_CXX14 || Language == Lang_CXX17 || - Language == Lang_CXX20 || Language == Lang_CXX23; + bool isCXX() const { + return false +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + || Language == Lang_##lang##version +#include "clang/Testing/TestLanguage.def" + ; } - bool isCXX17OrLater() const { - return Language == Lang_CXX17 || Language == Lang_CXX20 || - Language == Lang_CXX23; + bool isCXX(int Version) const { + return false +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + || (Version == version && Language == Lang_##lang##version) +#include "clang/Testing/TestLanguage.def" + ; } - bool isCXX20OrLater() const { - return Language == Lang_CXX20 || Language == Lang_CXX23; + bool isCXXOrLater(int MinimumStdVersion) const { + const auto MinimumStdVersionIndex = 0 +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + +(MinimumStdVersion == version ? 
version_index : 0) +#include "clang/Testing/TestLanguage.def" + ; + switch (Language) { +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return MinimumStdVersionIndex <= version_index; +#include "clang/Testing/TestLanguage.def" + default: + return false; + } } - bool isCXX23OrLater() const { return Language == Lang_CXX23; } + bool isCXX11OrLater() const { return isCXXOrLater(11); } + + bool isCXX14OrLater() const { return isCXXOrLater(14); } + + bool isCXX17OrLater() const { return isCXXOrLater(17); } + + bool isCXX20OrLater() const { return isCXXOrLater(20); } + + bool isCXX23OrLater() const { return isCXXOrLater(23); } + + bool isCXXOrEarlier(int MaximumStdVersion) const { + return isCXX() && + (isCXX(MaximumStdVersion) || !isCXXOrLater(MaximumStdVersion)); + } bool supportsCXXDynamicExceptionSpecification() const { return Language == Lang_CXX03 || Language == Lang_CXX11 || @@ -75,6 +128,30 @@ struct TestClangConfig { return Result; } + std::string toShortString() const { + std::string Result; + llvm::raw_string_ostream OS(Result); + switch (Language) { +#define TESTLANGUAGE(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + OS << (#lang #version); \ + break; +#include "clang/Testing/TestLanguage.def" + case Lang_OpenCL: + OS << "OpenCL"; + break; + case Lang_OBJC: + OS << "OBJC"; + break; + case Lang_OBJCXX: + OS << "OBJCXX"; + break; + } + + OS << (Target.find("win") != std::string::npos ? "_win" : ""); + return Result; + } + std::string toString() const { std::string Result; llvm::raw_string_ostream OS(Result); diff --git a/clang/include/clang/Testing/TestLanguage.def b/clang/include/clang/Testing/TestLanguage.def new file mode 100644 index 0000000000000..ac62b176a0b87 --- /dev/null +++ b/clang/include/clang/Testing/TestLanguage.def @@ -0,0 +1,47 @@ + +//===-- TestLanguage.def - Language Versions for Testing --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// The TESTLANGUAGE(-C/-CXX) macros have four parameters: +// the language, the standard version, the corresponding compile-flag, +// and an index of the language version for each language. +// The standard version is used to compare a standard version numerically, +// and the index is used to impose ordering for the language versions +// with respect to each language. + +#ifndef TESTLANGUAGE +# define TESTLANGUAGE(...) +#endif + +#ifndef TESTLANGUAGE_C +# define TESTLANGUAGE_C(...) TESTLANGUAGE(__VA_ARGS__) +#endif + +#ifndef TESTLANGUAGE_CXX +# define TESTLANGUAGE_CXX(...) 
TESTLANGUAGE(__VA_ARGS__) +#endif + +TESTLANGUAGE_C(C, 89, c89, 0) +TESTLANGUAGE_C(C, 99, c99, 1) +TESTLANGUAGE_C(C, 11, c11, 2) +TESTLANGUAGE_C(C, 17, c17, 3) +TESTLANGUAGE_C(C, 23, c23, 4) +TESTLANGUAGE_C(C, 26, c2y, 5) + +// TESTLANGUAGE_CXX(CXX, 98, c++98, 0) +TESTLANGUAGE_CXX(CXX, 03, c++03, 1) +TESTLANGUAGE_CXX(CXX, 11, c++11, 2) +TESTLANGUAGE_CXX(CXX, 14, c++14, 3) +TESTLANGUAGE_CXX(CXX, 17, c++17, 4) +TESTLANGUAGE_CXX(CXX, 20, c++20, 5) +TESTLANGUAGE_CXX(CXX, 23, c++23, 6) +TESTLANGUAGE_CXX(CXX, 26, c++26, 7) + +#undef TESTLANGUAGE_CXX +#undef TESTLANGUAGE_C +#undef TESTLANGUAGE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index fd8aa8de79b49..735def67f7840 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1441,7 +1441,7 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target, if (Target.getTriple().isAMDGPU() || (AuxTarget && AuxTarget->getTriple().isAMDGPU())) { -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ InitBuiltinType(SingletonId, BuiltinType::Id); #include "clang/Basic/AMDGPUTypes.def" } @@ -2264,7 +2264,7 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Align = 8; \ break; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_OPAQUE_PTR_TYPE(NAME, AS, WIDTH, ALIGN, ID, SINGLETONID) \ +#define AMDGPU_TYPE(NAME, ID, SINGLETONID, WIDTH, ALIGN) \ case BuiltinType::ID: \ Width = WIDTH; \ Align = ALIGN; \ @@ -2272,8 +2272,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" - Width = 0; - Align = 8; + Width = Target->getPointerWidth(LangAS::Default); + Align = Target->getPointerAlign(LangAS::Default); break; } break; @@ -3398,7 +3398,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, #include "clang/Basic/HLSLIntangibleTypes.def" case BuiltinType::Dependent: llvm_unreachable("should never get here"); -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::WasmExternRef: #define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id: @@ -3535,6 +3535,50 @@ QualType ASTContext::getCountAttributedType( return QualType(CATy, 0); } +QualType +ASTContext::adjustType(QualType Orig, + llvm::function_ref Adjust) const { + switch (Orig->getTypeClass()) { + case Type::Attributed: { + const auto *AT = dyn_cast(Orig); + return getAttributedType(AT->getAttrKind(), + adjustType(AT->getModifiedType(), Adjust), + adjustType(AT->getEquivalentType(), Adjust)); + } + + case Type::BTFTagAttributed: { + const auto *BTFT = dyn_cast(Orig); + return getBTFTagAttributedType(BTFT->getAttr(), + adjustType(BTFT->getWrappedType(), Adjust)); + } + + case Type::Elaborated: { + const auto *ET = cast(Orig); + return getElaboratedType(ET->getKeyword(), ET->getQualifier(), + adjustType(ET->getNamedType(), Adjust)); + } + + case Type::Paren: + return getParenType( + adjustType(cast(Orig)->getInnerType(), Adjust)); + + case Type::Adjusted: { + const auto *AT = cast(Orig); + return getAdjustedType(AT->getOriginalType(), + adjustType(AT->getAdjustedType(), Adjust)); + } + + case Type::MacroQualified: { + const auto *MQT = cast(Orig); + return getMacroQualifiedType(adjustType(MQT->getUnderlyingType(), Adjust), + 
MQT->getMacroIdentifier()); + } + + default: + return Adjust(Orig); + } +} + const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T, FunctionType::ExtInfo Info) { if (T->getExtInfo() == Info) @@ -3553,13 +3597,23 @@ const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T, return cast(Result.getTypePtr()); } +QualType ASTContext::adjustFunctionResultType(QualType FunctionType, + QualType ResultType) { + return adjustType(FunctionType, [&](QualType Orig) { + if (const auto *FNPT = Orig->getAs()) + return getFunctionNoProtoType(ResultType, FNPT->getExtInfo()); + + const auto *FPT = Orig->castAs(); + return getFunctionType(ResultType, FPT->getParamTypes(), + FPT->getExtProtoInfo()); + }); +} + void ASTContext::adjustDeducedFunctionResultType(FunctionDecl *FD, QualType ResultType) { FD = FD->getMostRecentDecl(); while (true) { - const auto *FPT = FD->getType()->castAs(); - FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); - FD->setType(getFunctionType(ResultType, FPT->getParamTypes(), EPI)); + FD->setType(adjustFunctionResultType(FD->getType(), ResultType)); if (FunctionDecl *Next = FD->getPreviousDecl()) FD = Next; else @@ -3575,30 +3629,11 @@ void ASTContext::adjustDeducedFunctionResultType(FunctionDecl *FD, /// and preserved. Other type sugar (for instance, typedefs) is not. QualType ASTContext::getFunctionTypeWithExceptionSpec( QualType Orig, const FunctionProtoType::ExceptionSpecInfo &ESI) const { - // Might have some parens. - if (const auto *PT = dyn_cast(Orig)) - return getParenType( - getFunctionTypeWithExceptionSpec(PT->getInnerType(), ESI)); - - // Might be wrapped in a macro qualified type. - if (const auto *MQT = dyn_cast(Orig)) - return getMacroQualifiedType( - getFunctionTypeWithExceptionSpec(MQT->getUnderlyingType(), ESI), - MQT->getMacroIdentifier()); - - // Might have a calling-convention attribute. - if (const auto *AT = dyn_cast(Orig)) - return getAttributedType( - AT->getAttrKind(), - getFunctionTypeWithExceptionSpec(AT->getModifiedType(), ESI), - getFunctionTypeWithExceptionSpec(AT->getEquivalentType(), ESI)); - - // Anything else must be a function type. Rebuild it with the new exception - // specification. 
- const auto *Proto = Orig->castAs(); - return getFunctionType( - Proto->getReturnType(), Proto->getParamTypes(), - Proto->getExtProtoInfo().withExceptionSpec(ESI)); + return adjustType(Orig, [&](QualType Ty) { + const auto *Proto = Ty->castAs(); + return getFunctionType(Proto->getReturnType(), Proto->getParamTypes(), + Proto->getExtProtoInfo().withExceptionSpec(ESI)); + }); } bool ASTContext::hasSameFunctionTypeIgnoringExceptionSpec(QualType T, @@ -5165,7 +5200,7 @@ QualType ASTContext::getAttributedType(attr::Kind attrKind, } QualType ASTContext::getBTFTagAttributedType(const BTFTypeTagAttr *BTFAttr, - QualType Wrapped) { + QualType Wrapped) const { llvm::FoldingSetNodeID ID; BTFTagAttributedType::Profile(ID, Wrapped, BTFAttr); @@ -8598,7 +8633,7 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C, #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" { DiagnosticsEngine &Diags = C->getDiagnostics(); diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index bba97e289da2e..020a2f396b5aa 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1138,7 +1138,7 @@ ExpectedType ASTNodeImporter::VisitBuiltinType(const BuiltinType *T) { case BuiltinType::Id: \ return Importer.getToContext().SingletonId; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case BuiltinType::Id: \ return Importer.getToContext().SingletonId; #include "clang/Basic/AMDGPUTypes.def" @@ -10020,8 +10020,8 @@ Expected ASTImporter::Import(FileID FromID, bool IsBuiltin) { ToIncludeLocOrFakeLoc = ToSM.getLocForStartOfFile(ToSM.getMainFileID()); if (Cache->OrigEntry && Cache->OrigEntry->getDir()) { - // FIXME: We probably want to use getVirtualFile(), so we don't hit the - // disk again + // FIXME: We probably want to use getVirtualFileRef(), so we don't hit + // the disk again // FIXME: We definitely want to re-use the existing MemoryBuffer, rather // than mmap the files several times. auto Entry = diff --git a/clang/lib/AST/ByteCode/Boolean.h b/clang/lib/AST/ByteCode/Boolean.h index f1914ddb9970d..c568b557574e2 100644 --- a/clang/lib/AST/ByteCode/Boolean.h +++ b/clang/lib/AST/ByteCode/Boolean.h @@ -30,6 +30,7 @@ class Boolean final { public: /// Zero-initializes a boolean. 
Boolean() : V(false) {} + Boolean(const llvm::APSInt &I) : V(!I.isZero()) {} explicit Boolean(bool V) : V(V) {} bool operator<(Boolean RHS) const { return V < RHS.V; } diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index b8778f6027894..4fd697ebe4938 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -8,6 +8,7 @@ #include "ByteCodeEmitter.h" #include "Context.h" +#include "FixedPoint.h" #include "Floating.h" #include "IntegralAP.h" #include "Opcode.h" diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index e54b6568d7060..680be736aa647 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -9,6 +9,7 @@ #include "Compiler.h" #include "ByteCodeEmitter.h" #include "Context.h" +#include "FixedPoint.h" #include "Floating.h" #include "Function.h" #include "InterpShared.h" @@ -470,6 +471,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { } case CK_IntegralToBoolean: + case CK_FixedPointToBoolean: case CK_BooleanToSignedIntegral: case CK_IntegralCast: { if (DiscardResult) @@ -670,6 +672,45 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { ToSize, CE); }; + case CK_IntegralToFixedPoint: { + if (!this->visit(SubExpr)) + return false; + + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastIntegralFixedPoint(classifyPrim(SubExpr->getType()), I, + CE); + } + case CK_FloatingToFixedPoint: { + if (!this->visit(SubExpr)) + return false; + + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastFloatingFixedPoint(I, CE); + } + case CK_FixedPointToFloating: { + if (!this->visit(SubExpr)) + return false; + const auto *TargetSemantics = &Ctx.getFloatSemantics(CE->getType()); + return this->emitCastFixedPointFloating(TargetSemantics, CE); + } + case CK_FixedPointToIntegral: { + if (!this->visit(SubExpr)) + return false; + return this->emitCastFixedPointIntegral(classifyPrim(CE->getType()), CE); + } + case CK_FixedPointCast: { + if (!this->visit(SubExpr)) + return false; + auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); + uint32_t I; + std::memcpy(&I, &Sem, sizeof(Sem)); + return this->emitCastFixedPoint(I, CE); + } + case CK_ToVoid: return discard(SubExpr); @@ -717,6 +758,19 @@ bool Compiler::VisitImaginaryLiteral(const ImaginaryLiteral *E) { return this->visitArrayElemInit(1, SubExpr); } +template +bool Compiler::VisitFixedPointLiteral(const FixedPointLiteral *E) { + assert(E->getType()->isFixedPointType()); + assert(classifyPrim(E) == PT_FixedPoint); + + if (DiscardResult) + return true; + + auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + APInt Value = E->getValue(); + return this->emitConstFixedPoint(FixedPoint(Value, Sem), E); +} + template bool Compiler::VisitParenExpr(const ParenExpr *E) { return this->delegate(E->getSubExpr()); @@ -750,6 +804,8 @@ bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { RHS->getType()->isAnyComplexType()) && BO->isComparisonOp()) return this->emitComplexComparison(LHS, RHS, BO); + if (LHS->getType()->isFixedPointType() || RHS->getType()->isFixedPointType()) + return this->VisitFixedPointBinOp(BO); if (BO->isPtrMemOp()) { if (!this->visit(LHS)) @@ -1446,6 +1502,112 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { return true; } 
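The fixed-point casts above, and the VisitFixedPointBinOp added next, pass an llvm::FixedPointSemantics to the interpreter by memcpy'ing it into a uint32_t opcode argument. A minimal sketch of that round trip, assuming (as the memcpy implies) that FixedPointSemantics occupies exactly 32 bits; packSema/unpackSema are hypothetical names, not part of this patch:

#include <cstdint>
#include <cstring>
#include "llvm/ADT/APFixedPoint.h"

// Compile-time encode: squeeze the semantics bit-field into an opcode arg.
static uint32_t packSema(const llvm::FixedPointSemantics &Sem) {
  uint32_t I;
  std::memcpy(&I, &Sem, sizeof(Sem));
  return I;
}

// Interpreter-side decode: recover the semantics from the opcode arg.
static llvm::FixedPointSemantics unpackSema(uint32_t I) {
  llvm::FixedPointSemantics Sem(0, 0, false, false, false);
  std::memcpy(&Sem, &I, sizeof(Sem));
  return Sem;
}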
+template +bool Compiler::VisitFixedPointBinOp(const BinaryOperator *E) { + const Expr *LHS = E->getLHS(); + const Expr *RHS = E->getRHS(); + + assert(LHS->getType()->isFixedPointType() || + RHS->getType()->isFixedPointType()); + + auto LHSSema = Ctx.getASTContext().getFixedPointSemantics(LHS->getType()); + auto RHSSema = Ctx.getASTContext().getFixedPointSemantics(RHS->getType()); + + if (!this->visit(LHS)) + return false; + if (!LHS->getType()->isFixedPointType()) { + uint32_t I; + std::memcpy(&I, &LHSSema, sizeof(llvm::FixedPointSemantics)); + if (!this->emitCastIntegralFixedPoint(classifyPrim(LHS->getType()), I, E)) + return false; + } + + if (!this->visit(RHS)) + return false; + if (!RHS->getType()->isFixedPointType()) { + uint32_t I; + std::memcpy(&I, &RHSSema, sizeof(llvm::FixedPointSemantics)); + if (!this->emitCastIntegralFixedPoint(classifyPrim(RHS->getType()), I, E)) + return false; + } + + // Convert the result to the target semantics. + auto ConvertResult = [&](bool R) -> bool { + if (!R) + return false; + auto ResultSema = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + auto CommonSema = LHSSema.getCommonSemantics(RHSSema); + if (ResultSema != CommonSema) { + uint32_t I; + std::memcpy(&I, &ResultSema, sizeof(ResultSema)); + return this->emitCastFixedPoint(I, E); + } + return true; + }; + + auto MaybeCastToBool = [&](bool Result) { + if (!Result) + return false; + PrimType T = classifyPrim(E); + if (DiscardResult) + return this->emitPop(T, E); + if (T != PT_Bool) + return this->emitCast(PT_Bool, T, E); + return true; + }; + + switch (E->getOpcode()) { + case BO_EQ: + return MaybeCastToBool(this->emitEQFixedPoint(E)); + case BO_NE: + return MaybeCastToBool(this->emitNEFixedPoint(E)); + case BO_LT: + return MaybeCastToBool(this->emitLTFixedPoint(E)); + case BO_LE: + return MaybeCastToBool(this->emitLEFixedPoint(E)); + case BO_GT: + return MaybeCastToBool(this->emitGTFixedPoint(E)); + case BO_GE: + return MaybeCastToBool(this->emitGEFixedPoint(E)); + case BO_Add: + return ConvertResult(this->emitAddFixedPoint(E)); + case BO_Sub: + return ConvertResult(this->emitSubFixedPoint(E)); + case BO_Mul: + return ConvertResult(this->emitMulFixedPoint(E)); + case BO_Div: + return ConvertResult(this->emitDivFixedPoint(E)); + case BO_Shl: + return ConvertResult(this->emitShiftFixedPoint(/*Left=*/true, E)); + case BO_Shr: + return ConvertResult(this->emitShiftFixedPoint(/*Left=*/false, E)); + + default: + return this->emitInvalid(E); + } + + llvm_unreachable("unhandled binop opcode"); +} + +template +bool Compiler::VisitFixedPointUnaryOperator(const UnaryOperator *E) { + const Expr *SubExpr = E->getSubExpr(); + assert(SubExpr->getType()->isFixedPointType()); + + switch (E->getOpcode()) { + case UO_Plus: + return this->delegate(SubExpr); + case UO_Minus: + if (!this->visit(SubExpr)) + return false; + return this->emitNegFixedPoint(E); + default: + return false; + } + + llvm_unreachable("Unhandled unary opcode"); +} + template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { @@ -3685,9 +3847,13 @@ bool Compiler::visitZeroInitializer(PrimType T, QualType QT, return this->emitNullFnPtr(nullptr, E); case PT_MemberPtr: return this->emitNullMemberPtr(nullptr, E); - case PT_Float: { + case PT_Float: return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E); + case PT_FixedPoint: { + auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + return this->emitConstFixedPoint(FixedPoint::zero(Sem), E); } + llvm_unreachable("Implement"); 
} llvm_unreachable("unknown primitive type"); } @@ -3798,6 +3964,7 @@ bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { case PT_Float: case PT_IntAP: case PT_IntAPS: + case PT_FixedPoint: llvm_unreachable("Invalid integral type"); break; } @@ -5293,7 +5460,7 @@ bool Compiler::compileDestructor(const CXXDestructorDecl *Dtor) { if (!D->isPrimitive() && !D->isPrimitiveArray()) { if (!this->emitGetPtrField(Field.Offset, SourceInfo{})) return false; - if (!this->emitDestruction(D)) + if (!this->emitDestruction(D, SourceInfo{})) return false; if (!this->emitPopPtr(SourceInfo{})) return false; @@ -5302,9 +5469,12 @@ bool Compiler::compileDestructor(const CXXDestructorDecl *Dtor) { } for (const Record::Base &Base : llvm::reverse(R->bases())) { + if (Base.R->isAnonymousUnion()) + continue; + if (!this->emitGetPtrBase(Base.Offset, SourceInfo{})) return false; - if (!this->emitRecordDestruction(Base.R)) + if (!this->emitRecordDestruction(Base.R, {})) return false; if (!this->emitPopPtr(SourceInfo{})) return false; @@ -5347,6 +5517,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return this->VisitComplexUnaryOperator(E); if (SubExpr->getType()->isVectorType()) return this->VisitVectorUnaryOperator(E); + if (SubExpr->getType()->isFixedPointType()) + return this->VisitFixedPointUnaryOperator(E); std::optional T = classify(SubExpr->getType()); switch (E->getOpcode()) { @@ -6145,8 +6317,9 @@ bool Compiler::emitComplexComparison(const Expr *LHS, const Expr *RHS, /// on the stack. /// Emit destruction of record types (or arrays of record types). template -bool Compiler::emitRecordDestruction(const Record *R) { +bool Compiler::emitRecordDestruction(const Record *R, SourceInfo Loc) { assert(R); + assert(!R->isAnonymousUnion()); const CXXDestructorDecl *Dtor = R->getDestructor(); if (!Dtor || Dtor->isTrivial()) return true; @@ -6157,15 +6330,16 @@ bool Compiler::emitRecordDestruction(const Record *R) { return false; assert(DtorFunc->hasThisPointer()); assert(DtorFunc->getNumParams() == 1); - if (!this->emitDupPtr(SourceInfo{})) + if (!this->emitDupPtr(Loc)) return false; - return this->emitCall(DtorFunc, 0, SourceInfo{}); + return this->emitCall(DtorFunc, 0, Loc); } /// When calling this, we have a pointer of the local-to-destroy /// on the stack. /// Emit destruction of record types (or arrays of record types). 
template -bool Compiler::emitDestruction(const Descriptor *Desc) { +bool Compiler::emitDestruction(const Descriptor *Desc, + SourceInfo Loc) { assert(Desc); assert(!Desc->isPrimitive()); assert(!Desc->isPrimitiveArray()); @@ -6189,20 +6363,23 @@ bool Compiler::emitDestruction(const Descriptor *Desc) { } for (ssize_t I = Desc->getNumElems() - 1; I >= 0; --I) { - if (!this->emitConstUint64(I, SourceInfo{})) + if (!this->emitConstUint64(I, Loc)) return false; - if (!this->emitArrayElemPtrUint64(SourceInfo{})) + if (!this->emitArrayElemPtrUint64(Loc)) return false; - if (!this->emitDestruction(ElemDesc)) + if (!this->emitDestruction(ElemDesc, Loc)) return false; - if (!this->emitPopPtr(SourceInfo{})) + if (!this->emitPopPtr(Loc)) return false; } return true; } assert(Desc->ElemRecord); - return this->emitRecordDestruction(Desc->ElemRecord); + if (Desc->ElemRecord->isAnonymousUnion()) + return true; + + return this->emitRecordDestruction(Desc->ElemRecord, Loc); } namespace clang { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 2dfa187713a80..22e078f3fe546 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -125,12 +125,15 @@ class Compiler : public ConstStmtVisitor, bool>, bool VisitIntegerLiteral(const IntegerLiteral *E); bool VisitFloatingLiteral(const FloatingLiteral *E); bool VisitImaginaryLiteral(const ImaginaryLiteral *E); + bool VisitFixedPointLiteral(const FixedPointLiteral *E); bool VisitParenExpr(const ParenExpr *E); bool VisitBinaryOperator(const BinaryOperator *E); bool VisitLogicalBinOp(const BinaryOperator *E); bool VisitPointerArithBinOp(const BinaryOperator *E); bool VisitComplexBinOp(const BinaryOperator *E); bool VisitVectorBinOp(const BinaryOperator *E); + bool VisitFixedPointBinOp(const BinaryOperator *E); + bool VisitFixedPointUnaryOperator(const UnaryOperator *E); bool VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E); bool VisitCallExpr(const CallExpr *E); bool VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinID); @@ -364,8 +367,8 @@ class Compiler : public ConstStmtVisitor, bool>, bool emitComplexBoolCast(const Expr *E); bool emitComplexComparison(const Expr *LHS, const Expr *RHS, const BinaryOperator *E); - bool emitRecordDestruction(const Record *R); - bool emitDestruction(const Descriptor *Desc); + bool emitRecordDestruction(const Record *R, SourceInfo Loc); + bool emitDestruction(const Descriptor *Desc, SourceInfo Loc); unsigned collectBaseOffset(const QualType BaseType, const QualType DerivedType); bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD); @@ -540,7 +543,7 @@ template class LocalScope : public VariableScope { if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) return false; - if (!this->Ctx->emitDestruction(Local.Desc)) + if (!this->Ctx->emitDestruction(Local.Desc, Local.Desc->getLoc())) return false; if (!this->Ctx->emitPopPtr(E)) diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 8661acf536658..9bca8138cd9f6 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -198,6 +198,9 @@ std::optional Context::classify(QualType T) const { if (const auto *DT = dyn_cast(T)) return classify(DT->getUnderlyingType()); + if (T->isFixedPointType()) + return PT_FixedPoint; + return std::nullopt; } diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index 05ece907af42f..65ac7a3129aba 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ 
b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -8,6 +8,7 @@ #include "Descriptor.h" #include "Boolean.h" +#include "FixedPoint.h" #include "Floating.h" #include "FunctionPointer.h" #include "IntegralAP.h" @@ -15,6 +16,7 @@ #include "Pointer.h" #include "PrimType.h" #include "Record.h" +#include "Source.h" using namespace clang; using namespace clang::interp; @@ -423,6 +425,14 @@ SourceLocation Descriptor::getLocation() const { llvm_unreachable("Invalid descriptor type"); } +SourceInfo Descriptor::getLoc() const { + if (const auto *D = Source.dyn_cast<const Decl *>()) + return SourceInfo(D); + if (const auto *E = Source.dyn_cast<const Expr *>()) + return SourceInfo(E); + llvm_unreachable("Invalid descriptor type"); +} + bool Descriptor::isUnion() const { return isRecord() && ElemRecord->isUnion(); } InitMap::InitMap(unsigned N) diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 82f90430f7f4e..5460199e0e991 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -21,6 +21,7 @@ namespace clang { namespace interp { class Block; class Record; +class SourceInfo; struct InitMap; struct Descriptor; enum PrimType : unsigned; @@ -194,6 +195,7 @@ struct Descriptor final { QualType getType() const; QualType getElemQualType() const; SourceLocation getLocation() const; + SourceInfo getLoc() const; const Decl *asDecl() const { return Source.dyn_cast<const Decl *>(); } const Expr *asExpr() const { return Source.dyn_cast<const Expr *>(); } diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index e1051e5c2bbf6..85522ffd32dcc 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -13,6 +13,7 @@ #include "Boolean.h" #include "Context.h" #include "EvaluationResult.h" +#include "FixedPoint.h" #include "Floating.h" #include "Function.h" #include "FunctionPointer.h" @@ -126,6 +127,8 @@ static const char *primTypeToString(PrimType T) { return "FnPtr"; case PT_MemberPtr: return "MemberPtr"; + case PT_FixedPoint: + return "FixedPoint"; } llvm_unreachable("Unhandled PrimType"); } diff --git a/clang/lib/AST/ByteCode/FixedPoint.h b/clang/lib/AST/ByteCode/FixedPoint.h new file mode 100644 index 0000000000000..ab8d6d7f02b52 --- /dev/null +++ b/clang/lib/AST/ByteCode/FixedPoint.h @@ -0,0 +1,172 @@ +//===------- FixedPoint.h - Fixed point types for the VM --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_FIXED_POINT_H +#define LLVM_CLANG_AST_INTERP_FIXED_POINT_H + +#include "clang/AST/APValue.h" +#include "clang/AST/ComparisonCategories.h" +#include "llvm/ADT/APFixedPoint.h" + +namespace clang { +namespace interp { + +using APInt = llvm::APInt; +using APSInt = llvm::APSInt; + +/// Wrapper around fixed point types. +class FixedPoint final { +private: + llvm::APFixedPoint V; + +public: + FixedPoint(llvm::APFixedPoint &&V) : V(std::move(V)) {} + FixedPoint(llvm::APFixedPoint &V) : V(V) {} + FixedPoint(APInt V, llvm::FixedPointSemantics Sem) : V(V, Sem) {} + // This needs to be default-constructible so llvm::endian::read works. 
+ FixedPoint() + : V(APInt(0, 0ULL, false), + llvm::FixedPointSemantics(0, 0, false, false, false)) {} + + static FixedPoint zero(llvm::FixedPointSemantics Sem) { + return FixedPoint(APInt(Sem.getWidth(), 0ULL, Sem.isSigned()), Sem); + } + + static FixedPoint from(const APSInt &I, llvm::FixedPointSemantics Sem, + bool *Overflow) { + return FixedPoint(llvm::APFixedPoint::getFromIntValue(I, Sem, Overflow)); + } + static FixedPoint from(const llvm::APFloat &I, llvm::FixedPointSemantics Sem, + bool *Overflow) { + return FixedPoint(llvm::APFixedPoint::getFromFloatValue(I, Sem, Overflow)); + } + + operator bool() const { return V.getBoolValue(); } + void print(llvm::raw_ostream &OS) const { OS << V; } + + APValue toAPValue(const ASTContext &) const { return APValue(V); } + APSInt toAPSInt(unsigned BitWidth = 0) const { return V.getValue(); } + + unsigned bitWidth() const { return V.getWidth(); } + bool isSigned() const { return V.isSigned(); } + bool isZero() const { return V.getValue().isZero(); } + bool isNegative() const { return V.getValue().isNegative(); } + bool isPositive() const { return V.getValue().isNonNegative(); } + bool isMin() const { + return V == llvm::APFixedPoint::getMin(V.getSemantics()); + } + bool isMinusOne() const { return V.isSigned() && V.getValue() == -1; } + + FixedPoint truncate(unsigned BitWidth) const { return *this; } + + FixedPoint toSemantics(const llvm::FixedPointSemantics &Sem, + bool *Overflow) const { + return FixedPoint(V.convert(Sem, Overflow)); + } + llvm::FixedPointSemantics getSemantics() const { return V.getSemantics(); } + + llvm::APFloat toFloat(const llvm::fltSemantics *Sem) const { + return V.convertToFloat(*Sem); + } + + llvm::APSInt toInt(unsigned BitWidth, bool Signed, bool *Overflow) const { + return V.convertToInt(BitWidth, Signed, Overflow); + } + + std::string toDiagnosticString(const ASTContext &Ctx) const { + return V.toString(); + } + + ComparisonCategoryResult compare(const FixedPoint &Other) const { + int c = V.compare(Other.V); + if (c == 0) + return ComparisonCategoryResult::Equal; + else if (c < 0) + return ComparisonCategoryResult::Less; + return ComparisonCategoryResult::Greater; + } + + static bool neg(const FixedPoint &A, FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.negate(&Overflow)); + return Overflow; + } + + static bool add(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.add(B.V, &Overflow)); + return Overflow; + } + static bool sub(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.sub(B.V, &Overflow)); + return Overflow; + } + static bool mul(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.mul(B.V, &Overflow)); + return Overflow; + } + static bool div(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + bool Overflow = false; + *R = FixedPoint(A.V.div(B.V, &Overflow)); + return Overflow; + } + + static bool shiftLeft(const FixedPoint A, const FixedPoint B, unsigned OpBits, + FixedPoint *R) { + unsigned Amt = B.V.getValue().getLimitedValue(OpBits); + bool Overflow; + *R = FixedPoint(A.V.shl(Amt, &Overflow)); + return Overflow; + } + static bool shiftRight(const FixedPoint A, const FixedPoint B, + unsigned OpBits, FixedPoint *R) { + unsigned Amt = B.V.getValue().getLimitedValue(OpBits); + bool Overflow; + *R = FixedPoint(A.V.shr(Amt, &Overflow)); + return Overflow; + } + + 
static bool rem(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + llvm_unreachable("Rem doesn't exist for fixed point values"); + return true; + } + static bool bitAnd(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + static bool bitOr(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + static bool bitXor(const FixedPoint A, const FixedPoint B, unsigned Bits, + FixedPoint *R) { + return true; + } + + static bool increment(const FixedPoint &A, FixedPoint *R) { return true; } + static bool decrement(const FixedPoint &A, FixedPoint *R) { return true; } +}; + +inline FixedPoint getSwappedBytes(FixedPoint F) { return F; } + +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, FixedPoint F) { + F.print(OS); + return OS; +} + +} // namespace interp +} // namespace clang + +#endif diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index b9c85626ffa99..fd9a256843a0e 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1043,6 +1043,25 @@ bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) { return false; } +static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func, + const Pointer &ThisPtr) { + assert(Func->isConstructor()); + + const Descriptor *D = ThisPtr.getFieldDesc(); + + // FIXME: I think this case is not 100% correct. E.g. a pointer into a + // subobject of a composite array. + if (!D->ElemRecord) + return true; + + if (D->ElemRecord->getNumVirtualBases() == 0) + return true; + + S.FFDiag(S.Current->getLocation(OpPC), diag::note_constexpr_virtual_base) + << Func->getParentDecl(); + return false; +} + bool CallVar(InterpState &S, CodePtr OpPC, const Function *Func, uint32_t VarArgSize) { if (Func->hasThisPointer()) { @@ -1117,6 +1136,9 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (!CheckInvoke(S, OpPC, ThisPtr)) return cleanup(); } + + if (Func->isConstructor() && !checkConstructor(S, OpPC, Func, ThisPtr)) + return false; } if (!CheckCallable(S, OpPC, Func)) @@ -1136,6 +1158,7 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, InterpFrame *FrameBefore = S.Current; S.Current = NewFrame.get(); + InterpStateCCOverride CCOverride(S, Func->getDecl()->isImmediateFunction()); APValue CallResult; // Note that we cannot assert(CallResult.hasValue()) here since // Ret() above only sets the APValue if the curent frame doesn't @@ -1299,7 +1322,8 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, const auto *NewExpr = cast(E); QualType StorageType = Ptr.getType(); - if (isa_and_nonnull(Ptr.getFieldDesc()->asExpr())) { + if (isa_and_nonnull(Ptr.getFieldDesc()->asExpr()) && + StorageType->isPointerType()) { // FIXME: Are there other cases where this is a problem? 
StorageType = StorageType->getPointeeType(); } @@ -1369,6 +1393,19 @@ bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E) { return false; } +bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, + const FixedPoint &FP) { + const Expr *E = S.Current->getExpr(OpPC); + if (S.checkingForUndefinedBehavior()) { + S.getASTContext().getDiagnostics().Report( + E->getExprLoc(), diag::warn_fixedpoint_constant_overflow) + << FP.toDiagnosticString(S.getASTContext()) << E->getType(); + } + S.CCEDiag(E, diag::note_constexpr_overflow) + << FP.toDiagnosticString(S.getASTContext()) << E->getType(); + return S.noteUndefinedBehavior(); +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). // This is being used by the ops to determine wheter diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index b029399a1554b..5c3ee5e689f1c 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -16,6 +16,7 @@ #include "../ExprConstShared.h" #include "Boolean.h" #include "DynamicAllocator.h" +#include "FixedPoint.h" #include "Floating.h" #include "Function.h" #include "FunctionPointer.h" @@ -37,6 +38,7 @@ namespace clang { namespace interp { using APSInt = llvm::APSInt; +using FixedPointSemantics = llvm::FixedPointSemantics; /// Convert a value to an APValue. template @@ -160,6 +162,15 @@ bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, const CallExpr *CE); bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T); +template +static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_overflow) << SrcValue << E->getType(); + return S.noteUndefinedBehavior(); +} +bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, + const FixedPoint &FP); + enum class ShiftDir { Left, Right }; /// Checks if the shift operation is legal. @@ -224,14 +235,16 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) { return false; } - if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) { - APSInt LHSInt = LHS.toAPSInt(); - SmallString<32> Trunc; - (-LHSInt.extend(LHSInt.getBitWidth() + 1)).toString(Trunc, 10); - const SourceInfo &Loc = S.Current->getSource(OpPC); - const Expr *E = S.Current->getExpr(OpPC); - S.CCEDiag(Loc, diag::note_constexpr_overflow) << Trunc << E->getType(); - return false; + if constexpr (!std::is_same_v) { + if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) { + APSInt LHSInt = LHS.toAPSInt(); + SmallString<32> Trunc; + (-LHSInt.extend(LHSInt.getBitWidth() + 1)).toString(Trunc, 10); + const SourceInfo &Loc = S.Current->getSource(OpPC); + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(Loc, diag::note_constexpr_overflow) << Trunc << E->getType(); + return false; + } } return true; } @@ -363,10 +376,13 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, S.Stk.push(Result); return true; } - // If for some reason evaluation continues, use the truncated results. S.Stk.push(Result); + // Short-circuit fixed-points here since the error handling is easier. + if constexpr (std::is_same_v) + return handleFixedPointOverflow(S, OpPC, Result); + // Slow path - compute the result using another bit of precision. 
APSInt Value = OpAP()(LHS.toAPSInt(Bits), RHS.toAPSInt(Bits)); @@ -383,13 +399,10 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, << Trunc << Type << E->getSourceRange(); } - S.CCEDiag(E, diag::note_constexpr_overflow) << Value << Type; - - if (!S.noteUndefinedBehavior()) { + if (!handleOverflow(S, OpPC, Value)) { S.Stk.pop(); return false; } - return true; } @@ -679,6 +692,13 @@ bool Div(InterpState &S, CodePtr OpPC) { S.Stk.push(Result); return true; } + + if constexpr (std::is_same_v) { + if (handleFixedPointOverflow(S, OpPC, Result)) { + S.Stk.push(Result); + return true; + } + } return false; } @@ -739,8 +759,7 @@ bool Neg(InterpState &S, CodePtr OpPC) { return true; } - S.CCEDiag(E, diag::note_constexpr_overflow) << NegatedValue << Type; - return S.noteUndefinedBehavior(); + return handleOverflow(S, OpPC, NegatedValue); } enum class PushVal : bool { @@ -802,8 +821,7 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return true; } - S.CCEDiag(E, diag::note_constexpr_overflow) << APResult << Type; - return S.noteUndefinedBehavior(); + return handleOverflow(S, OpPC, APResult); } /// 1) Pops a pointer from the stack @@ -1623,14 +1641,14 @@ inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off) { inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) { const Pointer &Ptr = S.Stk.peek(); + if (!CheckNull(S, OpPC, Ptr, CSK_Base)) + return false; if (!Ptr.isBlockPointer()) { S.Stk.push(Ptr.asIntPointer().baseCast(S.getASTContext(), Off)); return true; } - if (!CheckNull(S, OpPC, Ptr, CSK_Base)) - return false; if (!CheckSubobject(S, OpPC, Ptr, CSK_Base)) return false; const Pointer &Result = Ptr.atField(Off); @@ -1643,13 +1661,14 @@ inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) { inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) { const Pointer &Ptr = S.Stk.pop(); + if (!CheckNull(S, OpPC, Ptr, CSK_Base)) + return false; + if (!Ptr.isBlockPointer()) { S.Stk.push(Ptr.asIntPointer().baseCast(S.getASTContext(), Off)); return true; } - if (!CheckNull(S, OpPC, Ptr, CSK_Base)) - return false; if (!CheckSubobject(S, OpPC, Ptr, CSK_Base)) return false; const Pointer &Result = Ptr.atField(Off); @@ -2159,6 +2178,22 @@ inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem, return true; } +inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { + FixedPointSemantics TargetSemantics(0, 0, false, false, false); + std::memcpy(&TargetSemantics, &FPS, sizeof(TargetSemantics)); + + const auto &Source = S.Stk.pop(); + + bool Overflow; + FixedPoint Result = Source.toSemantics(TargetSemantics, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + /// Like Cast(), but we cast to an arbitrary-bitwidth integral, so we need /// to know what bitwidth the result should be. template ::T> @@ -2229,13 +2264,8 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC, auto Status = F.convertToInteger(Result); // Float-to-Integral overflow check. 
- if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) { - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); - - S.CCEDiag(E, diag::note_constexpr_overflow) << F.getAPFloat() << Type; - return S.noteUndefinedBehavior(); - } + if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) + return handleOverflow(S, OpPC, F.getAPFloat()); FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI); S.Stk.push>(IntegralAP(Result)); @@ -2250,13 +2280,8 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC, auto Status = F.convertToInteger(Result); // Float-to-Integral overflow check. - if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) { - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); - - S.CCEDiag(E, diag::note_constexpr_overflow) << F.getAPFloat() << Type; - return S.noteUndefinedBehavior(); - } + if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) + return handleOverflow(S, OpPC, F.getAPFloat()); FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI); S.Stk.push>(IntegralAP(Result)); @@ -2310,6 +2335,63 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, return true; } +template ::T> +static inline bool CastIntegralFixedPoint(InterpState &S, CodePtr OpPC, + uint32_t FPS) { + const T &Int = S.Stk.pop(); + + FixedPointSemantics Sem(0, 0, false, false, false); + std::memcpy(&Sem, &FPS, sizeof(Sem)); + + bool Overflow; + FixedPoint Result = FixedPoint::from(Int.toAPSInt(), Sem, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + +static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC, + uint32_t FPS) { + const auto &Float = S.Stk.pop(); + + FixedPointSemantics Sem(0, 0, false, false, false); + std::memcpy(&Sem, &FPS, sizeof(Sem)); + + bool Overflow; + FixedPoint Result = FixedPoint::from(Float.getAPFloat(), Sem, &Overflow); + + if (Overflow && !handleFixedPointOverflow(S, OpPC, Result)) + return false; + + S.Stk.push(Result); + return true; +} + +static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC, + const llvm::fltSemantics *Sem) { + const auto &Fixed = S.Stk.pop(); + + S.Stk.push(Fixed.toFloat(Sem)); + return true; +} + +template ::T> +static inline bool CastFixedPointIntegral(InterpState &S, CodePtr OpPC) { + const auto &Fixed = S.Stk.pop(); + + bool Overflow; + APSInt Int = Fixed.toInt(T::bitWidth(), T::isSigned(), &Overflow); + + if (Overflow && !handleOverflow(S, OpPC, Int)) + return false; + + S.Stk.push(Int); + return true; +} + static inline bool PtrPtrCast(InterpState &S, CodePtr OpPC, bool SrcIsVoidPtr) { const auto &Ptr = S.Stk.peek(); @@ -2500,6 +2582,42 @@ inline bool Shl(InterpState &S, CodePtr OpPC) { return DoShift(S, OpPC, LHS, RHS); } +static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) { + const auto &RHS = S.Stk.pop(); + const auto &LHS = S.Stk.pop(); + llvm::FixedPointSemantics LHSSema = LHS.getSemantics(); + + unsigned ShiftBitWidth = + LHSSema.getWidth() - (unsigned)LHSSema.hasUnsignedPadding() - 1; + + // Embedded-C 4.1.6.2.2: + // The right operand must be nonnegative and less than the total number + // of (nonpadding) bits of the fixed-point operand ... 
+ if (RHS.isNegative()) { + S.CCEDiag(S.Current->getLocation(OpPC), diag::note_constexpr_negative_shift) + << RHS.toAPSInt(); + } else if (static_cast(RHS.toAPSInt().getLimitedValue( + ShiftBitWidth)) != RHS.toAPSInt()) { + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_large_shift) + << RHS.toAPSInt() << E->getType() << ShiftBitWidth; + } + + FixedPoint Result; + if (Left) { + if (FixedPoint::shiftLeft(LHS, RHS, ShiftBitWidth, &Result) && + !handleFixedPointOverflow(S, OpPC, Result)) + return false; + } else { + if (FixedPoint::shiftRight(LHS, RHS, ShiftBitWidth, &Result) && + !handleFixedPointOverflow(S, OpPC, Result)) + return false; + } + + S.Stk.push(Result); + return true; +} + //===----------------------------------------------------------------------===// // NoRet //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 68710f67be200..ebc800623f0d4 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -14,6 +14,7 @@ #include "clang/AST/OSLog.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/Support/SipHash.h" @@ -136,16 +137,17 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, APValue &Result, static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { + unsigned Depth = S.Current->getDepth(); + auto isStdCall = [](const FunctionDecl *F) -> bool { + return F && F->isInStdNamespace() && F->getIdentifier() && + F->getIdentifier()->isStr("is_constant_evaluated"); + }; + const InterpFrame *Caller = Frame->Caller; // The current frame is the one for __builtin_is_constant_evaluated. // The one above that, potentially the one for std::is_constant_evaluated(). if (S.inConstantContext() && !S.checkingPotentialConstantExpression() && - Frame->Caller && S.getEvalStatus().Diag) { - auto isStdCall = [](const FunctionDecl *F) -> bool { - return F && F->isInStdNamespace() && F->getIdentifier() && - F->getIdentifier()->isStr("is_constant_evaluated"); - }; - const InterpFrame *Caller = Frame->Caller; - + S.getEvalStatus().Diag && + (Depth == 1 || (Depth == 2 && isStdCall(Caller->getCallee())))) { if (Caller->Caller && isStdCall(Caller->getCallee())) { const Expr *E = Caller->Caller->getExpr(Caller->getRetPC()); S.report(E->getExprLoc(), @@ -1151,6 +1153,114 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } +static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); + PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); + APSInt Val = peekToAPSInt(S.Stk, ValT, + align(primSize(ValT)) + align(primSize(IndexT))); + APSInt Index = peekToAPSInt(S.Stk, IndexT); + + unsigned BitWidth = Val.getBitWidth(); + uint64_t Shift = Index.extractBitsAsZExtValue(8, 0); + uint64_t Length = Index.extractBitsAsZExtValue(8, 8); + Length = Length > BitWidth ? BitWidth : Length; + + // Handle out of bounds cases. 
+ if (Length == 0 || Shift >= BitWidth) { + pushInteger(S, 0, Call->getType()); + return true; + } + + uint64_t Result = Val.getZExtValue() >> Shift; + Result &= llvm::maskTrailingOnes(Length); + pushInteger(S, Result, Call->getType()); + return true; +} + +static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); + PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); + + APSInt Val = peekToAPSInt(S.Stk, ValT, + align(primSize(ValT)) + align(primSize(IndexT))); + APSInt Idx = peekToAPSInt(S.Stk, IndexT); + + unsigned BitWidth = Val.getBitWidth(); + uint64_t Index = Idx.extractBitsAsZExtValue(8, 0); + + if (Index < BitWidth) + Val.clearHighBits(BitWidth - Index); + + pushInteger(S, Val, Call->getType()); + return true; +} + +static bool interp__builtin_ia32_lzcnt(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + APSInt Val = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); + pushInteger(S, Val.countLeadingZeros(), Call->getType()); + return true; +} + +static bool interp__builtin_ia32_tzcnt(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + APSInt Val = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); + pushInteger(S, Val.countTrailingZeros(), Call->getType()); + return true; +} + +static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); + PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); + + APSInt Val = + peekToAPSInt(S.Stk, ValT, align(primSize(ValT)) + align(primSize(MaskT))); + APSInt Mask = peekToAPSInt(S.Stk, MaskT); + + unsigned BitWidth = Val.getBitWidth(); + APInt Result = APInt::getZero(BitWidth); + for (unsigned I = 0, P = 0; I != BitWidth; ++I) { + if (Mask[I]) + Result.setBitVal(I, Val[P++]); + } + pushInteger(S, Result, Call->getType()); + return true; +} + +static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); + PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); + + APSInt Val = + peekToAPSInt(S.Stk, ValT, align(primSize(ValT)) + align(primSize(MaskT))); + APSInt Mask = peekToAPSInt(S.Stk, MaskT); + + unsigned BitWidth = Val.getBitWidth(); + APInt Result = APInt::getZero(BitWidth); + for (unsigned I = 0, P = 0; I != BitWidth; ++I) { + if (Mask[I]) + Result.setBitVal(P++, Val[I]); + } + pushInteger(S, Result, Call->getType()); + return true; +} + static bool interp__builtin_os_log_format_buffer_size(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, @@ -1736,6 +1846,46 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case clang::X86::BI__builtin_ia32_bextr_u32: + case clang::X86::BI__builtin_ia32_bextr_u64: + case clang::X86::BI__builtin_ia32_bextri_u32: + case clang::X86::BI__builtin_ia32_bextri_u64: + if (!interp__builtin_ia32_bextr(S, OpPC, Frame, F, Call)) + return false; + break; + + case clang::X86::BI__builtin_ia32_bzhi_si: + case clang::X86::BI__builtin_ia32_bzhi_di: + if (!interp__builtin_ia32_bzhi(S, OpPC, Frame, F, Call)) + return false; + break; + + case clang::X86::BI__builtin_ia32_lzcnt_u16: + case clang::X86::BI__builtin_ia32_lzcnt_u32: + case 
clang::X86::BI__builtin_ia32_lzcnt_u64: + if (!interp__builtin_ia32_lzcnt(S, OpPC, Frame, F, Call)) + return false; + break; + + case clang::X86::BI__builtin_ia32_tzcnt_u16: + case clang::X86::BI__builtin_ia32_tzcnt_u32: + case clang::X86::BI__builtin_ia32_tzcnt_u64: + if (!interp__builtin_ia32_tzcnt(S, OpPC, Frame, F, Call)) + return false; + break; + + case clang::X86::BI__builtin_ia32_pdep_si: + case clang::X86::BI__builtin_ia32_pdep_di: + if (!interp__builtin_ia32_pdep(S, OpPC, Frame, F, Call)) + return false; + break; + + case clang::X86::BI__builtin_ia32_pext_si: + case clang::X86::BI__builtin_ia32_pext_di: + if (!interp__builtin_ia32_pext(S, OpPC, Frame, F, Call)) + return false; + break; + case Builtin::BI__builtin_os_log_format_buffer_size: if (!interp__builtin_os_log_format_buffer_size(S, OpPC, Frame, F, Call)) return false; diff --git a/clang/lib/AST/ByteCode/InterpStack.cpp b/clang/lib/AST/ByteCode/InterpStack.cpp index ae3721e983741..b183335dd5884 100644 --- a/clang/lib/AST/ByteCode/InterpStack.cpp +++ b/clang/lib/AST/ByteCode/InterpStack.cpp @@ -8,6 +8,7 @@ #include "InterpStack.h" #include "Boolean.h" +#include "FixedPoint.h" #include "Floating.h" #include "Integral.h" #include "MemberPointer.h" diff --git a/clang/lib/AST/ByteCode/InterpStack.h b/clang/lib/AST/ByteCode/InterpStack.h index 43988bb680d1c..f7b8c386bcc13 100644 --- a/clang/lib/AST/ByteCode/InterpStack.h +++ b/clang/lib/AST/ByteCode/InterpStack.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_AST_INTERP_INTERPSTACK_H #define LLVM_CLANG_AST_INTERP_INTERPSTACK_H +#include "FixedPoint.h" #include "FunctionPointer.h" #include "IntegralAP.h" #include "MemberPointer.h" @@ -190,6 +191,8 @@ class InterpStack final { return PT_IntAP; else if constexpr (std::is_same_v) return PT_MemberPtr; + else if constexpr (std::is_same_v) + return PT_FixedPoint; llvm_unreachable("unknown type push()'ed into InterpStack"); } diff --git a/clang/lib/AST/ByteCode/InterpState.cpp b/clang/lib/AST/ByteCode/InterpState.cpp index 4ea05305540ee..287c3bd3bca3a 100644 --- a/clang/lib/AST/ByteCode/InterpState.cpp +++ b/clang/lib/AST/ByteCode/InterpState.cpp @@ -19,6 +19,13 @@ InterpState::InterpState(State &Parent, Program &P, InterpStack &Stk, Context &Ctx, SourceMapper *M) : Parent(Parent), M(M), P(P), Stk(Stk), Ctx(Ctx), Current(nullptr) {} +bool InterpState::inConstantContext() const { + if (ConstantContextOverride) + return *ConstantContextOverride; + + return Parent.InConstantContext; +} + InterpState::~InterpState() { while (Current) { InterpFrame *Next = Current->Caller; diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h index 4b7371450cc98..2a1311c86a2f2 100644 --- a/clang/lib/AST/ByteCode/InterpState.h +++ b/clang/lib/AST/ByteCode/InterpState.h @@ -77,7 +77,7 @@ class InterpState final : public State, public SourceMapper { bool noteUndefinedBehavior() override { return Parent.noteUndefinedBehavior(); } - bool inConstantContext() const { return Parent.InConstantContext; } + bool inConstantContext() const; bool hasActiveDiagnostic() override { return Parent.hasActiveDiagnostic(); } void setActiveDiagnostic(bool Flag) override { Parent.setActiveDiagnostic(Flag); @@ -116,6 +116,7 @@ class InterpState final : public State, public SourceMapper { private: friend class EvaluationResult; + friend class InterpStateCCOverride; /// AST Walker state. State &Parent; /// Dead block chain. 
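The interp__builtin_ia32_pdep/pext evaluators above implement the BMI2 bit scatter/gather loops on APInts. A host-side model for reference, using hypothetical helpers fixed to 32 bits for brevity:

#include <cstdint>

// pdep: scatter the low bits of Val into the positions where Mask is set.
static uint32_t pdep32(uint32_t Val, uint32_t Mask) {
  uint32_t Result = 0;
  for (unsigned I = 0, P = 0; I != 32; ++I)
    if (Mask & (1u << I))
      Result |= ((Val >> P++) & 1u) << I;
  return Result;
}

// pext: gather the bits of Val at set mask positions into the low bits.
static uint32_t pext32(uint32_t Val, uint32_t Mask) {
  uint32_t Result = 0;
  for (unsigned I = 0, P = 0; I != 32; ++I)
    if (Mask & (1u << I))
      Result |= ((Val >> I) & 1u) << P++;
  return Result;
}

// For example, pext32(0b101101, 0b001100) == 0b11 and
// pdep32(0b11, 0b001100) == 0b001100.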
@@ -124,6 +125,7 @@ class InterpState final : public State, public SourceMapper {
   SourceMapper *M;
   /// Allocator used for dynamic allocations performed via the program.
   DynamicAllocator Alloc;
+  std::optional<bool> ConstantContextOverride;
 
 public:
   /// Reference to the module containing all bytecode.
@@ -144,6 +146,26 @@ class InterpState final : public State, public SourceMapper {
       SeenGlobalTemporaries;
 };
 
+class InterpStateCCOverride final {
+public:
+  InterpStateCCOverride(InterpState &Ctx, bool Value)
+      : Ctx(Ctx), OldCC(Ctx.ConstantContextOverride) {
+    // We only override this if the new value is true.
+    Enabled = Value;
+    if (Enabled)
+      Ctx.ConstantContextOverride = Value;
+  }
+  ~InterpStateCCOverride() {
+    if (Enabled)
+      Ctx.ConstantContextOverride = OldCC;
+  }
+
+private:
+  bool Enabled;
+  InterpState &Ctx;
+  std::optional<bool> OldCC;
+};
+
 } // namespace interp
 } // namespace clang
diff --git a/clang/lib/AST/ByteCode/MemberPointer.cpp b/clang/lib/AST/ByteCode/MemberPointer.cpp
index 0fe94db97a3c4..dfc8583e464ab 100644
--- a/clang/lib/AST/ByteCode/MemberPointer.cpp
+++ b/clang/lib/AST/ByteCode/MemberPointer.cpp
@@ -79,7 +79,7 @@ APValue MemberPointer::toAPValue(const ASTContext &ASTCtx) const {
   if (hasBase())
     return Base.toAPValue(ASTCtx);
 
-  return APValue(cast<ValueDecl>(getDecl()), /*IsDerivedMember=*/false,
+  return APValue(getDecl(), /*IsDerivedMember=*/false,
                  /*Path=*/{});
 }
 
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 36191f096aeb8..61b6f2e8daa2f 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -31,6 +31,7 @@ def Float : Type;
 def Ptr : Type;
 def FnPtr : Type;
 def MemberPtr : Type;
+def FixedPoint : Type;
 
 //===----------------------------------------------------------------------===//
 // Types transferred to the interpreter.
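[Reviewer note, not part of the patch] The Opcodes.td and PrimType.h hunks around here register FixedPoint as a new primitive interpreter type. The core C++ mechanism is a tag-to-representation map built from explicit template specializations (the PrimConv pattern), sketched below with stand-in types; only the pattern, not the actual clang code, is shown.

#include <cstddef>
#include <cstdint>

enum PrimType : unsigned { PT_Sint32, PT_Bool, PT_FixedPoint };

struct FixedPoint {
  std::int32_t Value = 0; // placeholder for value + fixed-point semantics
};

// Each enumerator maps to a concrete representation type.
template <PrimType T> struct PrimConv;
template <> struct PrimConv<PT_Sint32> { using T = std::int32_t; };
template <> struct PrimConv<PT_Bool> { using T = bool; };
template <> struct PrimConv<PT_FixedPoint> { using T = FixedPoint; };

// Generic code can then be instantiated per tag:
template <PrimType P> constexpr std::size_t repSize() {
  return sizeof(typename PrimConv<P>::T);
}

static_assert(repSize<PT_Sint32>() == 4, "expected 4-byte representation");

int main() {}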
@@ -49,6 +50,7 @@ def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
 def ArgBool : ArgType { let Name = "bool"; }
+def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
 def ArgFunction : ArgType { let Name = "const Function *"; }
 def ArgRecordDecl : ArgType { let Name = "const RecordDecl *"; }
@@ -82,6 +84,11 @@ def IntegerTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS];
 }
 
+def IntegerAndFixedTypeClass : TypeClass {
+  let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
+               Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
+}
+
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -96,7 +103,7 @@ def FloatTypeClass : TypeClass {
 }
 
 def AluTypeClass : TypeClass {
-  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool], [FixedPoint]);
 }
 
 def PtrTypeClass : TypeClass {
@@ -104,7 +111,7 @@ def PtrTypeClass : TypeClass {
 }
 
 def NonPtrTypeClass : TypeClass {
-  let Types = !listconcat(IntegerTypeClass.Types, [Bool], [Float]);
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool], [Float], [FixedPoint]);
 }
 
 def AllTypeClass : TypeClass {
@@ -144,7 +151,7 @@ class FloatOpcode : Opcode {
 }
 
 class IntegerOpcode : Opcode {
-  let Types = [IntegerTypeClass];
+  let Types = [IntegerAndFixedTypeClass];
   let HasGroup = 1;
 }
 
@@ -255,6 +262,7 @@ def ConstFloat : ConstOpcode<Float, ArgFloat>;
 def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
 def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
+def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
 // [] -> [Integer]
 def Zero : Opcode {
@@ -607,7 +615,7 @@ def IsNonNull : Opcode {
 //===----------------------------------------------------------------------===//
 
 def FromCastTypeClass : TypeClass {
-  let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, Bool, IntAP, IntAPS];
+  let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, Bool, IntAP, IntAPS, FixedPoint];
 }
 
 def ToCastTypeClass : TypeClass {
@@ -623,6 +631,10 @@ def CastFP : Opcode {
   let Args = [ArgFltSemantics, ArgRoundingMode];
 }
 
+def CastFixedPoint : Opcode {
+  let Args = [ArgUint32];
+}
+
 def FixedSizeIntegralTypes : TypeClass {
   let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, Bool];
 }
@@ -671,6 +683,25 @@ def CastPointerIntegralAP : Opcode {
   let Args = [ArgUint32];
 }
 def CastPointerIntegralAPS : Opcode {
   let Args = [ArgUint32];
 }
+def CastIntegralFixedPoint : Opcode {
+  let Types = [FixedSizeIntegralTypes];
+  let Args = [ArgUint32];
+  let HasGroup = 1;
+}
+def CastFloatingFixedPoint : Opcode {
+  let Args = [ArgUint32];
+}
+def CastFixedPointFloating : Opcode {
+  let Args = [ArgFltSemantics];
+}
+def CastFixedPointIntegral : Opcode {
+  let Types = [FixedSizeIntegralTypes];
+  let HasGroup = 1;
+}
+def ShiftFixedPoint : Opcode {
+  let Args = [ArgBool];
+}
+
 def PtrPtrCast : Opcode {
   let Args = [ArgBool];
 
diff --git a/clang/lib/AST/ByteCode/PrimType.cpp b/clang/lib/AST/ByteCode/PrimType.cpp
index 3054e67d5c49f..7dbab996416cc 100644
--- a/clang/lib/AST/ByteCode/PrimType.cpp
+++ b/clang/lib/AST/ByteCode/PrimType.cpp
@@ -8,6 +8,7 @@
 #include "PrimType.h"
 #include "Boolean.h"
+#include "FixedPoint.h"
 #include "Floating.h"
 #include "FunctionPointer.h"
 #include "IntegralAP.h"
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index bb2f59d86e98d..59c04c4673d93 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -26,6 +26,7 @@ class Boolean;
 class Floating;
 class FunctionPointer;
 class MemberPointer;
+class FixedPoint;
 template <bool Signed> class IntegralAP;
 template <unsigned Bits, bool Signed> class Integral;
 
@@ -42,10 +43,11 @@ enum PrimType : unsigned {
   PT_IntAP = 8,
   PT_IntAPS = 9,
   PT_Bool = 10,
-  PT_Float = 11,
-  PT_Ptr = 12,
-  PT_FnPtr = 13,
-  PT_MemberPtr = 14,
+  PT_FixedPoint = 11,
+  PT_Float = 12,
+  PT_Ptr = 13,
+  PT_FnPtr = 14,
+  PT_MemberPtr = 15,
 };
 
 inline constexpr bool isPtrType(PrimType T) {
@@ -69,7 +71,7 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
   return OS;
 }
 
-constexpr bool isIntegralType(PrimType T) { return T <= PT_Bool; }
+constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -118,6 +120,9 @@ template <> struct PrimConv<PT_MemberPtr> {
   using T = MemberPointer;
 };
+template <> struct PrimConv<PT_FixedPoint> {
+  using T = FixedPoint;
+};
 
 /// Returns the size of a primitive type in bytes.
 size_t primSize(PrimType Type);
@@ -163,6 +168,7 @@ static inline bool aligned(const void *P) {
     TYPE_SWITCH_CASE(PT_Ptr, B) \
     TYPE_SWITCH_CASE(PT_FnPtr, B) \
     TYPE_SWITCH_CASE(PT_MemberPtr, B) \
+    TYPE_SWITCH_CASE(PT_FixedPoint, B) \
   } \
   } while (0)
 
diff --git a/clang/lib/AST/ByteCode/Record.cpp b/clang/lib/AST/ByteCode/Record.cpp
index ec1b55da347af..0c06bec7e5508 100644
--- a/clang/lib/AST/ByteCode/Record.cpp
+++ b/clang/lib/AST/ByteCode/Record.cpp
@@ -16,7 +16,8 @@ Record::Record(const RecordDecl *Decl, BaseList &&SrcBases,
                FieldList &&SrcFields, VirtualBaseList &&SrcVirtualBases,
                unsigned VirtualSize, unsigned BaseSize)
     : Decl(Decl), Bases(std::move(SrcBases)), Fields(std::move(SrcFields)),
-      BaseSize(BaseSize), VirtualSize(VirtualSize), IsUnion(Decl->isUnion()) {
+      BaseSize(BaseSize), VirtualSize(VirtualSize), IsUnion(Decl->isUnion()),
+      IsAnonymousUnion(IsUnion && Decl->isAnonymousStructOrUnion()) {
   for (Base &V : SrcVirtualBases)
     VirtualBases.push_back({V.Decl, V.Offset + BaseSize, V.Desc, V.R});
 
diff --git a/clang/lib/AST/ByteCode/Record.h b/clang/lib/AST/ByteCode/Record.h
index 83e15b125f77a..7a5c482e4efcc 100644
--- a/clang/lib/AST/ByteCode/Record.h
+++ b/clang/lib/AST/ByteCode/Record.h
@@ -54,6 +54,8 @@ class Record final {
   const std::string getName() const;
   /// Checks if the record is a union.
   bool isUnion() const { return IsUnion; }
+  /// Checks if the record is an anonymous union.
+  bool isAnonymousUnion() const { return IsAnonymousUnion; }
   /// Returns the size of the record.
   unsigned getSize() const { return BaseSize; }
   /// Returns the full size of the record, including records.
@@ -134,6 +136,8 @@ class Record final {
   unsigned VirtualSize;
   /// If this record is a union.
   bool IsUnion;
+  /// If this is an anonymous union.
+  bool IsAnonymousUnion;
 };
 
 } // namespace interp
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index a14b1b33d35ef..84ef9f74582ef 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3302,6 +3302,7 @@ bool FunctionDecl::isImmediateFunction() const {
 bool FunctionDecl::isMain() const {
   return isNamed(this, "main") && !getLangOpts().Freestanding &&
+         !getLangOpts().HLSL &&
          (getDeclContext()->getRedeclContext()->isTranslationUnit() ||
           isExternC());
 }
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 01143391edab4..f5a0aa8f82512 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -2211,9 +2211,10 @@ CXXDeductionGuideDecl *CXXDeductionGuideDecl::Create(
     ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
     ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T,
     TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor,
-    DeductionCandidate Kind) {
-  return new (C, DC) CXXDeductionGuideDecl(C, DC, StartLoc, ES, NameInfo, T,
-                                           TInfo, EndLocation, Ctor, Kind);
+    DeductionCandidate Kind, Expr *TrailingRequiresClause) {
+  return new (C, DC)
+      CXXDeductionGuideDecl(C, DC, StartLoc, ES, NameInfo, T, TInfo,
+                            EndLocation, Ctor, Kind, TrailingRequiresClause);
 }
 
 CXXDeductionGuideDecl *
@@ -2221,7 +2222,7 @@ CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) CXXDeductionGuideDecl(
       C, nullptr, SourceLocation(), ExplicitSpecifier(), DeclarationNameInfo(),
       QualType(), nullptr, SourceLocation(), nullptr,
-      DeductionCandidate::Normal);
+      DeductionCandidate::Normal, nullptr);
 }
 
 RequiresExprBodyDecl *RequiresExprBodyDecl::Create(
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 6387e375dda79..48816d3078826 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -52,10 +52,13 @@
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFixedPoint.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/SipHash.h"
@@ -2061,8 +2064,8 @@ static bool EvaluateIgnoredValue(EvalInfo &Info, const Expr *E) {
   return true;
 }
 
-/// Should this call expression be treated as a no-op?
-static bool IsNoOpCall(const CallExpr *E) {
+/// Should this call expression be treated as forming an opaque constant?
+static bool IsOpaqueConstantCall(const CallExpr *E) {
   unsigned Builtin = E->getBuiltinCallee();
   return (Builtin == Builtin::BI__builtin___CFStringMakeConstantString ||
           Builtin == Builtin::BI__builtin___NSStringMakeConstantString ||
           Builtin == Builtin::BI__builtin_function_start);
 }
 
+static bool IsOpaqueConstantCall(const LValue &LVal) {
+  const auto *BaseExpr =
+      llvm::dyn_cast_if_present<CallExpr>(LVal.Base.dyn_cast<const Expr *>());
+  return BaseExpr && IsOpaqueConstantCall(BaseExpr);
+}
+
 static bool IsGlobalLValue(APValue::LValueBase B) {
   // C++11 [expr.const]p3 An address constant expression is a prvalue core
   // constant expression of pointer type that evaluates to...
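[Reviewer note, not part of the patch] "Opaque constant call" means the builtin's result is a usable address constant whose identity is nonetheless unknowable at compile time, so comparing it against another address is not a constant expression. A hedged illustration compiled with Clang (this is not a test from the patch, and the diagnosed comparison is left commented out):

void f() {}
void g() {}

// The builtin yields an address constant, fine as an initializer:
const void *FStart = __builtin_function_start(f); // opaque constant

// But a constexpr comparison of two opaque constants would be rejected,
// since their relative placement is unknown during evaluation:
// constexpr bool Same =
//     __builtin_function_start(f) == __builtin_function_start(g);

int main() {}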
@@ -2115,7 +2124,7 @@ static bool IsGlobalLValue(APValue::LValueBase B) {
   case Expr::ObjCBoxedExprClass:
     return cast<ObjCBoxedExpr>(E)->isExpressibleAsConstantInitializer();
   case Expr::CallExprClass:
-    return IsNoOpCall(cast<CallExpr>(E));
+    return IsOpaqueConstantCall(cast<CallExpr>(E));
   // For GCC compatibility, &&label has static storage duration.
   case Expr::AddrLabelExprClass:
     return true;
@@ -2142,11 +2151,91 @@ static const ValueDecl *GetLValueBaseDecl(const LValue &LVal) {
   return LVal.Base.dyn_cast<const ValueDecl *>();
 }
 
-static bool IsLiteralLValue(const LValue &Value) {
-  if (Value.getLValueCallIndex())
+// Information about an LValueBase that is some kind of string.
+struct LValueBaseString {
+  std::string ObjCEncodeStorage;
+  StringRef Bytes;
+  int CharWidth;
+};
+
+// Gets the lvalue base of LVal as a string.
+static bool GetLValueBaseAsString(const EvalInfo &Info, const LValue &LVal,
+                                  LValueBaseString &AsString) {
+  const auto *BaseExpr = LVal.Base.dyn_cast<const Expr *>();
+  if (!BaseExpr)
+    return false;
+
+  // For ObjCEncodeExpr, we need to compute and store the string.
+  if (const auto *EE = dyn_cast<ObjCEncodeExpr>(BaseExpr)) {
+    Info.Ctx.getObjCEncodingForType(EE->getEncodedType(),
+                                    AsString.ObjCEncodeStorage);
+    AsString.Bytes = AsString.ObjCEncodeStorage;
+    AsString.CharWidth = 1;
+    return true;
+  }
+
+  // Otherwise, we have a StringLiteral.
+  const auto *Lit = dyn_cast<StringLiteral>(BaseExpr);
+  if (const auto *PE = dyn_cast<PredefinedExpr>(BaseExpr))
+    Lit = PE->getFunctionName();
+
+  if (!Lit)
+    return false;
+
+  AsString.Bytes = Lit->getBytes();
+  AsString.CharWidth = Lit->getCharByteWidth();
+  return true;
+}
+
+// Determine whether two string literals potentially overlap. This will be the
+// case if they agree on the values of all the bytes on the overlapping region
+// between them.
+//
+// The overlapping region is the portion of the two string literals that must
+// overlap in memory if the pointers actually point to the same address at
+// runtime. For example, if LHS is "abcdef" + 3 and RHS is "cdef\0gh" + 1 then
+// the overlapping region is "cdef\0", which in this case does agree, so the
+// strings are potentially overlapping. Conversely, for "foobar" + 3 versus
+// "bazbar" + 3, the overlapping region contains all of both strings, so they
+// are not potentially overlapping, even though they agree from the given
+// addresses onwards.
+//
+// See open core issue CWG2765 which is discussing the desired rule here.
+static bool ArePotentiallyOverlappingStringLiterals(const EvalInfo &Info,
+                                                    const LValue &LHS,
+                                                    const LValue &RHS) {
+  LValueBaseString LHSString, RHSString;
+  if (!GetLValueBaseAsString(Info, LHS, LHSString) ||
+      !GetLValueBaseAsString(Info, RHS, RHSString))
     return false;
-  const Expr *E = Value.Base.dyn_cast<const Expr *>();
-  return E && !isa<MaterializeTemporaryExpr>(E);
+
+  // This is the byte offset to the location of the first character of LHS
+  // within RHS. We don't need to look at the characters of one string that
+  // would appear before the start of the other string if they were merged.
+  CharUnits Offset = RHS.Offset - LHS.Offset;
+  if (Offset.isNegative())
+    LHSString.Bytes = LHSString.Bytes.drop_front(-Offset.getQuantity());
+  else
+    RHSString.Bytes = RHSString.Bytes.drop_front(Offset.getQuantity());
+
+  bool LHSIsLonger = LHSString.Bytes.size() > RHSString.Bytes.size();
+  StringRef Longer = LHSIsLonger ? LHSString.Bytes : RHSString.Bytes;
+  StringRef Shorter = LHSIsLonger ? RHSString.Bytes : LHSString.Bytes;
+  int ShorterCharWidth = (LHSIsLonger ? RHSString : LHSString).CharWidth;
+
+  // The null terminator isn't included in the string data, so check for it
+  // manually.
If the longer string doesn't have a null terminator where the + // shorter string ends, they aren't potentially overlapping. + for (int NullByte : llvm::seq(ShorterCharWidth)) { + if (Shorter.size() + NullByte >= Longer.size()) + break; + if (Longer[Shorter.size() + NullByte]) + return false; + } + + // Otherwise, they're potentially overlapping if and only if the overlapping + // region is the same. + return Shorter == Longer.take_front(Shorter.size()); } static bool IsWeakLValue(const LValue &Value) { @@ -8573,7 +8662,10 @@ class LValueExprEvaluator bool VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *E); bool VisitCompoundLiteralExpr(const CompoundLiteralExpr *E); bool VisitMemberExpr(const MemberExpr *E); - bool VisitStringLiteral(const StringLiteral *E) { return Success(E); } + bool VisitStringLiteral(const StringLiteral *E) { + return Success(APValue::LValueBase( + E, 0, Info.getASTContext().getNextStringLiteralVersion())); + } bool VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { return Success(E); } bool VisitCXXTypeidExpr(const CXXTypeidExpr *E); bool VisitCXXUuidofExpr(const CXXUuidofExpr *E); @@ -9639,7 +9731,7 @@ static bool isOneByteCharacterType(QualType T) { bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinOp) { - if (IsNoOpCall(E)) + if (IsOpaqueConstantCall(E)) return Success(E); switch (BuiltinOp) { @@ -12022,7 +12114,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T, #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" @@ -13371,6 +13463,91 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return false; return Success(DidOverflow, E); } + + case clang::X86::BI__builtin_ia32_bextr_u32: + case clang::X86::BI__builtin_ia32_bextr_u64: + case clang::X86::BI__builtin_ia32_bextri_u32: + case clang::X86::BI__builtin_ia32_bextri_u64: { + APSInt Val, Idx; + if (!EvaluateInteger(E->getArg(0), Val, Info) || + !EvaluateInteger(E->getArg(1), Idx, Info)) + return false; + + unsigned BitWidth = Val.getBitWidth(); + uint64_t Shift = Idx.extractBitsAsZExtValue(8, 0); + uint64_t Length = Idx.extractBitsAsZExtValue(8, 8); + Length = Length > BitWidth ? BitWidth : Length; + + // Handle out of bounds cases. 
+    if (Length == 0 || Shift >= BitWidth)
+      return Success(0, E);
+
+    uint64_t Result = Val.getZExtValue() >> Shift;
+    Result &= llvm::maskTrailingOnes<uint64_t>(Length);
+    return Success(Result, E);
+  }
+
+  case clang::X86::BI__builtin_ia32_bzhi_si:
+  case clang::X86::BI__builtin_ia32_bzhi_di: {
+    APSInt Val, Idx;
+    if (!EvaluateInteger(E->getArg(0), Val, Info) ||
+        !EvaluateInteger(E->getArg(1), Idx, Info))
+      return false;
+
+    unsigned BitWidth = Val.getBitWidth();
+    unsigned Index = Idx.extractBitsAsZExtValue(8, 0);
+    if (Index < BitWidth)
+      Val.clearHighBits(BitWidth - Index);
+    return Success(Val, E);
+  }
+
+  case clang::X86::BI__builtin_ia32_lzcnt_u16:
+  case clang::X86::BI__builtin_ia32_lzcnt_u32:
+  case clang::X86::BI__builtin_ia32_lzcnt_u64: {
+    APSInt Val;
+    if (!EvaluateInteger(E->getArg(0), Val, Info))
+      return false;
+    return Success(Val.countLeadingZeros(), E);
+  }
+
+  case clang::X86::BI__builtin_ia32_tzcnt_u16:
+  case clang::X86::BI__builtin_ia32_tzcnt_u32:
+  case clang::X86::BI__builtin_ia32_tzcnt_u64: {
+    APSInt Val;
+    if (!EvaluateInteger(E->getArg(0), Val, Info))
+      return false;
+    return Success(Val.countTrailingZeros(), E);
+  }
+
+  case clang::X86::BI__builtin_ia32_pdep_si:
+  case clang::X86::BI__builtin_ia32_pdep_di: {
+    APSInt Val, Msk;
+    if (!EvaluateInteger(E->getArg(0), Val, Info) ||
+        !EvaluateInteger(E->getArg(1), Msk, Info))
+      return false;
+
+    unsigned BitWidth = Val.getBitWidth();
+    APInt Result = APInt::getZero(BitWidth);
+    for (unsigned I = 0, P = 0; I != BitWidth; ++I)
+      if (Msk[I])
+        Result.setBitVal(I, Val[P++]);
+    return Success(Result, E);
+  }
+
+  case clang::X86::BI__builtin_ia32_pext_si:
+  case clang::X86::BI__builtin_ia32_pext_di: {
+    APSInt Val, Msk;
+    if (!EvaluateInteger(E->getArg(0), Val, Info) ||
+        !EvaluateInteger(E->getArg(1), Msk, Info))
+      return false;
+
+    unsigned BitWidth = Val.getBitWidth();
+    APInt Result = APInt::getZero(BitWidth);
+    for (unsigned I = 0, P = 0; I != BitWidth; ++I)
+      if (Msk[I])
+        Result.setBitVal(P++, Val[I]);
+    return Success(Result, E);
+  }
   }
 }
 
@@ -13889,13 +14066,22 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
         (!RHSValue.Base && !RHSValue.Offset.isZero()))
       return DiagComparison(diag::note_constexpr_pointer_constant_comparison,
                             !RHSValue.Base);
-    // It's implementation-defined whether distinct literals will have
-    // distinct addresses. In clang, the result of such a comparison is
-    // unspecified, so it is not a constant expression. However, we do know
-    // that the address of a literal will be non-null.
-    if ((IsLiteralLValue(LHSValue) || IsLiteralLValue(RHSValue)) &&
-        LHSValue.Base && RHSValue.Base)
+    // C++2c [intro.object]/10:
+    //   Two objects [...] may have the same address if [...] they are both
+    //   potentially non-unique objects.
+    // C++2c [intro.object]/9:
+    //   An object is potentially non-unique if it is a string literal object,
+    //   the backing array of an initializer list, or a subobject thereof.
+    //
+    // This makes the comparison result unspecified, so it's not a constant
+    // expression.
+    //
+    // TODO: Do we need to handle the initializer list case here?
+    if (ArePotentiallyOverlappingStringLiterals(Info, LHSValue, RHSValue))
       return DiagComparison(diag::note_constexpr_literal_comparison);
+    if (IsOpaqueConstantCall(LHSValue) || IsOpaqueConstantCall(RHSValue))
+      return DiagComparison(diag::note_constexpr_opaque_call_comparison,
+                            !IsOpaqueConstantCall(LHSValue));
     // We can't tell whether weak symbols will end up pointing to the same
     // object.
     if (IsWeakLValue(LHSValue) || IsWeakLValue(RHSValue))
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index b6e1da0c3192d..1feec13f9deef 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -464,7 +464,7 @@ class CXXNameMangler {
   void mangleSeqID(unsigned SeqID);
   void mangleName(GlobalDecl GD);
   void mangleType(QualType T);
-  void mangleNameOrStandardSubstitution(const NamedDecl *ND);
+  void mangleCXXRecordDecl(const CXXRecordDecl *Record);
   void mangleLambdaSig(const CXXRecordDecl *Lambda);
   void mangleModuleNamePrefix(StringRef Name, bool IsPartition = false);
   void mangleVendorQualifier(StringRef Name);
@@ -693,7 +693,7 @@ ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) {
     if (VD->isExternC())
       return getASTContext().getTranslationUnitDecl();
 
-  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
+  if (const auto *FD = D->getAsFunction()) {
     if (FD->isExternC())
       return getASTContext().getTranslationUnitDecl();
     // Member-like constrained friends are mangled as if they were members of
@@ -3029,9 +3029,13 @@ void CXXNameMangler::mangleType(QualType T) {
     addSubstitution(T);
 }
 
-void CXXNameMangler::mangleNameOrStandardSubstitution(const NamedDecl *ND) {
-  if (!mangleStandardSubstitution(ND))
-    mangleName(ND);
+void CXXNameMangler::mangleCXXRecordDecl(const CXXRecordDecl *Record) {
+  if (mangleSubstitution(Record))
+    return;
+  mangleName(Record);
+  if (isCompatibleWith(LangOptions::ClangABI::Ver19))
+    return;
+  addSubstitution(Record);
 }
 
 void CXXNameMangler::mangleType(const BuiltinType *T) {
@@ -3426,7 +3430,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
     Out << 'u' << type_name.size() << type_name; \
     break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
-#define AMDGPU_TYPE(Name, Id, SingletonId) \
+#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \
   case BuiltinType::Id: \
     type_name = Name; \
     Out << 'u' << type_name.size() << type_name; \
     break;
 #include "clang/Basic/AMDGPUTypes.def"
@@ -7309,7 +7313,7 @@ void ItaniumMangleContextImpl::mangleCXXVTable(const CXXRecordDecl *RD,
   //  <special-name> ::= TV <type>  # virtual table
   CXXNameMangler Mangler(*this, Out);
   Mangler.getStream() << "_ZTV";
-  Mangler.mangleNameOrStandardSubstitution(RD);
+  Mangler.mangleCXXRecordDecl(RD);
 }
 
 void ItaniumMangleContextImpl::mangleCXXVTT(const CXXRecordDecl *RD,
@@ -7317,7 +7321,7 @@ void ItaniumMangleContextImpl::mangleCXXVTT(const CXXRecordDecl *RD,
   //  <special-name> ::= TT <type>  # VTT structure
   CXXNameMangler Mangler(*this, Out);
   Mangler.getStream() << "_ZTT";
-  Mangler.mangleNameOrStandardSubstitution(RD);
+  Mangler.mangleCXXRecordDecl(RD);
 }
 
 void ItaniumMangleContextImpl::mangleCXXCtorVTable(const CXXRecordDecl *RD,
@@ -7327,10 +7331,10 @@ void ItaniumMangleContextImpl::mangleCXXCtorVTable(const CXXRecordDecl *RD,
   //  <special-name> ::= TC <type> <offset number> _ <base type>
   CXXNameMangler Mangler(*this, Out);
   Mangler.getStream() << "_ZTC";
-  Mangler.mangleNameOrStandardSubstitution(RD);
+  Mangler.mangleCXXRecordDecl(RD);
   Mangler.getStream() << Offset;
   Mangler.getStream() << '_';
-  Mangler.mangleNameOrStandardSubstitution(Type);
+  Mangler.mangleCXXRecordDecl(Type);
 }
 
 void ItaniumMangleContextImpl::mangleCXXRTTI(QualType Ty, raw_ostream &Out) {
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 7b069c66aed59..e4c8663c134fd 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2814,7 +2814,7 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
 #include "clang/Basic/PPCTypes.def"
 #define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include 
"clang/Basic/RISCVVTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::ShortAccum: case BuiltinType::Accum: diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp index 3d1f8488a8927..311fec32bbfa9 100644 --- a/clang/lib/AST/NSAPI.cpp +++ b/clang/lib/AST/NSAPI.cpp @@ -453,7 +453,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 95089a9b79e26..d864ded33e8d1 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -43,7 +43,7 @@ bool OpenACCClauseWithCondition::classof(const OpenACCClause *C) { bool OpenACCClauseWithSingleIntExpr::classof(const OpenACCClause *C) { return OpenACCNumWorkersClause::classof(C) || OpenACCVectorLengthClause::classof(C) || - OpenACCAsyncClause::classof(C); + OpenACCCollapseClause::classof(C) || OpenACCAsyncClause::classof(C); } OpenACCDefaultClause *OpenACCDefaultClause::Create(const ASTContext &C, OpenACCDefaultClauseKind K, @@ -134,6 +134,30 @@ OpenACCNumWorkersClause::Create(const ASTContext &C, SourceLocation BeginLoc, OpenACCNumWorkersClause(BeginLoc, LParenLoc, IntExpr, EndLoc); } +OpenACCCollapseClause::OpenACCCollapseClause(SourceLocation BeginLoc, + SourceLocation LParenLoc, + bool HasForce, Expr *LoopCount, + SourceLocation EndLoc) + : OpenACCClauseWithSingleIntExpr(OpenACCClauseKind::Collapse, BeginLoc, + LParenLoc, LoopCount, EndLoc), + HasForce(HasForce) { + assert(LoopCount && "LoopCount required"); +} + +OpenACCCollapseClause * +OpenACCCollapseClause::Create(const ASTContext &C, SourceLocation BeginLoc, + SourceLocation LParenLoc, bool HasForce, + Expr *LoopCount, SourceLocation EndLoc) { + assert( + LoopCount && + (LoopCount->isInstantiationDependent() || isa(LoopCount)) && + "Loop count not constant expression"); + void *Mem = + C.Allocate(sizeof(OpenACCCollapseClause), alignof(OpenACCCollapseClause)); + return new (Mem) + OpenACCCollapseClause(BeginLoc, LParenLoc, HasForce, LoopCount, EndLoc); +} + OpenACCVectorLengthClause::OpenACCVectorLengthClause(SourceLocation BeginLoc, SourceLocation LParenLoc, Expr *IntExpr, @@ -550,3 +574,11 @@ void OpenACCClausePrinter::VisitIndependentClause( void OpenACCClausePrinter::VisitSeqClause(const OpenACCSeqClause &C) { OS << "seq"; } + +void OpenACCClausePrinter::VisitCollapseClause(const OpenACCCollapseClause &C) { + OS << "collapse("; + if (C.hasForce()) + OS << "force:"; + printExpr(C.getLoopCount()); + OS << ")"; +} diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 3c6cd2d0f4341..2181591ba1790 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -865,7 +865,7 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt, #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include 
"clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index ad4281986f668..c3812844ab8a3 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2558,6 +2558,12 @@ void OpenACCClauseProfiler::VisitNumWorkersClause( Profiler.VisitStmt(Clause.getIntExpr()); } +void OpenACCClauseProfiler::VisitCollapseClause( + const OpenACCCollapseClause &Clause) { + assert(Clause.getLoopCount() && "collapse clause requires a valid int expr"); + Profiler.VisitStmt(Clause.getLoopCount()); +} + void OpenACCClauseProfiler::VisitPrivateClause( const OpenACCPrivateClause &Clause) { for (auto *E : Clause.getVarList()) diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 3c51c74647182..8a74159c7c93e 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -419,6 +419,12 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { // but print 'clause' here so it is clear what is happening from the dump. OS << " clause"; break; + case OpenACCClauseKind::Collapse: + OS << " clause"; + if (cast(C)->hasForce()) + OS << ": force"; + break; + case OpenACCClauseKind::CopyIn: case OpenACCClauseKind::PCopyIn: case OpenACCClauseKind::PresentOrCopyIn: diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index a55e6c8bf0261..c703e43f12a9a 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3453,7 +3453,7 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const { case Id: \ return Name; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case Id: \ return Name; #include "clang/Basic/AMDGPUTypes.def" @@ -4793,7 +4793,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index 8aada7e603407..fbb7fc5cd7690 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -428,7 +428,7 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp index 0707aa662e4cc..70ffe92753e05 100644 
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 0707aa662e4cc..70ffe92753e05 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -38,10 +38,25 @@
 namespace clang {
 namespace dataflow {
 
-static bool isTopLevelNamespaceWithName(const NamespaceDecl &NS,
-                                        llvm::StringRef Name) {
-  return NS.getDeclName().isIdentifier() && NS.getName() == Name &&
-         NS.getParent() != nullptr && NS.getParent()->isTranslationUnit();
+// Note: the Names appear in reverse order. E.g., to check
+// if NS is foo::bar::, call isFullyQualifiedNamespaceEqualTo(NS, "bar", "foo")
+template <class... NameTypes>
+static bool isFullyQualifiedNamespaceEqualTo(const NamespaceDecl &NS,
+                                             llvm::StringRef Name,
+                                             NameTypes... Names) {
+  if (!(NS.getDeclName().isIdentifier() && NS.getName() == Name &&
+        NS.getParent() != nullptr))
+    return false;
+
+  if constexpr (sizeof...(NameTypes) > 0) {
+    if (NS.getParent()->isTranslationUnit())
+      return false;
+    if (const auto *NextNS = dyn_cast_or_null<NamespaceDecl>(NS.getParent()))
+      return isFullyQualifiedNamespaceEqualTo(*NextNS, Names...);
+    return false;
+  } else {
+    return NS.getParent()->isTranslationUnit();
+  }
 }
 
 static bool hasOptionalClassName(const CXXRecordDecl &RD) {
@@ -50,15 +65,23 @@ static bool hasOptionalClassName(const CXXRecordDecl &RD) {
 
   if (RD.getName() == "optional") {
     if (const auto *N = dyn_cast_or_null<NamespaceDecl>(RD.getDeclContext()))
-      return N->isStdNamespace() || isTopLevelNamespaceWithName(*N, "absl");
+      return N->isStdNamespace() ||
+             isFullyQualifiedNamespaceEqualTo(*N, "absl") ||
+             isFullyQualifiedNamespaceEqualTo(*N, "bsl");
     return false;
   }
 
   if (RD.getName() == "Optional") {
     // Check whether namespace is "::base" or "::folly".
     const auto *N = dyn_cast_or_null<NamespaceDecl>(RD.getDeclContext());
-    return N != nullptr && (isTopLevelNamespaceWithName(*N, "base") ||
-                            isTopLevelNamespaceWithName(*N, "folly"));
+    return N != nullptr && (isFullyQualifiedNamespaceEqualTo(*N, "base") ||
+                            isFullyQualifiedNamespaceEqualTo(*N, "folly"));
+  }
+
+  if (RD.getName() == "NullableValue") {
+    const auto *N = dyn_cast_or_null<NamespaceDecl>(RD.getDeclContext());
+    return N != nullptr &&
+           isFullyQualifiedNamespaceEqualTo(*N, "bdlb", "BloombergLP");
   }
 
   return false;
@@ -195,22 +218,25 @@ auto isOptionalOperatorCallWithName(
 }
 
 auto isMakeOptionalCall() {
-  return callExpr(callee(functionDecl(hasAnyName(
-                      "std::make_optional", "base::make_optional",
-                      "absl::make_optional", "folly::make_optional"))),
-                  hasOptionalType());
+  return callExpr(
+      callee(functionDecl(hasAnyName(
+          "std::make_optional", "base::make_optional", "absl::make_optional",
+          "folly::make_optional", "bsl::make_optional"))),
+      hasOptionalType());
 }
 
 auto nulloptTypeDecl() {
   return namedDecl(hasAnyName("std::nullopt_t", "absl::nullopt_t",
-                              "base::nullopt_t", "folly::None"));
+                              "base::nullopt_t", "folly::None",
+                              "bsl::nullopt_t"));
 }
 
 auto hasNulloptType() { return hasType(nulloptTypeDecl()); }
 
 auto inPlaceClass() {
   return recordDecl(hasAnyName("std::in_place_t", "absl::in_place_t",
-                               "base::in_place_t", "folly::in_place_t"));
+                               "base::in_place_t", "folly::in_place_t",
+                               "bsl::in_place_t"));
 }
 
 auto isOptionalNulloptConstructor() {
@@ -415,6 +441,15 @@ void transferOptionalHasValueCall(const CXXMemberCallExpr *CallExpr,
   }
 }
 
+void transferOptionalIsNullCall(const CXXMemberCallExpr *CallExpr,
+                                const MatchFinder::MatchResult &,
+                                LatticeTransferState &State) {
+  if (auto *HasValueVal = getHasValue(
+          State.Env, getImplicitObjectLocation(*CallExpr, State.Env))) {
+    State.Env.setValue(*CallExpr, State.Env.makeNot(*HasValueVal));
+  }
+}
+
 /// `ModelPred` builds a logical formula relating the predicate in
/// `ValueOrPredExpr` to the optional's `has_value` property.
 void transferValueOrImpl(
@@ -784,6 +819,12 @@ auto buildTransferMatchSwitch() {
           isOptionalMemberCallWithNameMatcher(hasName("operator bool")),
           transferOptionalHasValueCall)
 
+      // NullableValue::isNull
+      // Only NullableValue has isNull
+      .CaseOfCFGStmt<CXXMemberCallExpr>(
+          isOptionalMemberCallWithNameMatcher(hasName("isNull")),
+          transferOptionalIsNullCall)
+
       // optional::emplace
       .CaseOfCFGStmt<CXXMemberCallExpr>(
           isOptionalMemberCallWithNameMatcher(hasName("emplace")),
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 0bd6845085b73..e23362fc7af00 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -136,7 +136,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) {
 
     // Create a DiagState and DiagStatePoint representing diagnostic changes
    // through command-line.
-    DiagStates.emplace_back(*Diags);
+    DiagStates.emplace_back();
    DiagStatesByLoc.appendFirst(&DiagStates.back());
   }
 }
@@ -147,11 +147,8 @@ DiagnosticsEngine::DiagState::getOrAddMapping(diag::kind Diag) {
       DiagMap.insert(std::make_pair(Diag, DiagnosticMapping()));
 
   // Initialize the entry if we added it.
-  if (Result.second) {
-    Result.first->second = DiagIDs.getDefaultMapping(Diag);
-    if (DiagnosticIDs::IsCustomDiag(Diag))
-      DiagIDs.initCustomDiagMapping(Result.first->second, Diag);
-  }
+  if (Result.second)
+    Result.first->second = DiagnosticIDs::getDefaultMapping(Diag);
 
   return Result.first->second;
 }
@@ -293,8 +290,7 @@ void DiagnosticsEngine::DiagStateMap::dump(SourceManager &SrcMgr,
 
       for (auto &Mapping : *Transition.State) {
         StringRef Option =
-            SrcMgr.getDiagnostics().Diags->getWarningOptionForDiag(
-                Mapping.first);
+            DiagnosticIDs::getWarningOptionForDiag(Mapping.first);
         if (!DiagName.empty() && DiagName != Option)
           continue;
@@ -338,7 +334,9 @@ void DiagnosticsEngine::PushDiagStatePoint(DiagState *State,
 
 void DiagnosticsEngine::setSeverity(diag::kind Diag, diag::Severity Map,
                                     SourceLocation L) {
-  assert((Diags->isWarningOrExtension(Diag) ||
+  assert(Diag < diag::DIAG_UPPER_LIMIT &&
+         "Can only map builtin diagnostics");
+  assert((Diags->isBuiltinWarningOrExtension(Diag) ||
           (Map == diag::Severity::Fatal || Map == diag::Severity::Error)) &&
          "Cannot map errors into warnings!");
   assert((L.isInvalid() || SourceMgr) && "No SourceMgr for valid location");
@@ -390,8 +388,6 @@ bool DiagnosticsEngine::setSeverityForGroup(diag::Flavor Flavor,
   if (Diags->getDiagnosticsInGroup(Flavor, Group, GroupDiags))
     return true;
 
-  Diags->setGroupSeverity(Group, Map);
-
   // Set the mapping.
   for (diag::kind Diag : GroupDiags)
     setSeverity(Diag, Map, Loc);
@@ -414,7 +410,6 @@ bool DiagnosticsEngine::setDiagnosticGroupWarningAsError(StringRef Group,
   if (Enabled)
     return setSeverityForGroup(diag::Flavor::WarningOrError, Group,
                                diag::Severity::Error);
-  Diags->setGroupSeverity(Group, diag::Severity::Warning);
 
   // Otherwise, we want to set the diagnostic mapping's "no Werror" bit, and
   // potentially downgrade anything already mapped to be a warning.
@@ -446,7 +441,6 @@ bool DiagnosticsEngine::setDiagnosticGroupErrorAsFatal(StringRef Group,
   if (Enabled)
     return setSeverityForGroup(diag::Flavor::WarningOrError, Group,
                                diag::Severity::Fatal);
-  Diags->setGroupSeverity(Group, diag::Severity::Error);
 
   // Otherwise, we want to set the diagnostic mapping's "no Wfatal-errors" bit,
   // and potentially downgrade anything already mapped to be a fatal error.
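[Reviewer note, not part of the patch] DiagState::getOrAddMapping above uses the classic lazy insert-then-initialize map idiom: insert a default-constructed value and compute the real default only when the insertion actually happened, so later lookups keep any overrides. A minimal standalone sketch under invented names (Mapping, computeDefault):

#include <cassert>
#include <map>

struct Mapping { int Severity = 0; };

static Mapping computeDefault(unsigned ID) { return Mapping{ID % 2 ? 2 : 1}; }

Mapping &getOrAddMapping(std::map<unsigned, Mapping> &Table, unsigned ID) {
  auto Result = Table.insert({ID, Mapping()});
  if (Result.second)                            // freshly inserted:
    Result.first->second = computeDefault(ID);  // fill in the default once
  return Result.first->second;
}

int main() {
  std::map<unsigned, Mapping> Table;
  Mapping &M = getOrAddMapping(Table, 3);
  assert(M.Severity == 2);
  M.Severity = 5; // a later override sticks:
  assert(getOrAddMapping(Table, 3).Severity == 5);
}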
@@ -479,7 +473,7 @@ void DiagnosticsEngine::setSeverityForAll(diag::Flavor Flavor,
 
   // Set the mapping.
   for (diag::kind Diag : AllDiags)
-    if (Diags->isWarningOrExtension(Diag))
+    if (Diags->isBuiltinWarningOrExtension(Diag))
       setSeverity(Diag, Map, Loc);
 }
 
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index 031d9d7817d1f..d45bb0f392d45 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -102,12 +102,13 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = {
 #undef DIAG
 };
 
+// Diagnostic classes.
 enum DiagnosticClass {
-  CLASS_NOTE = DiagnosticIDs::CLASS_NOTE,
-  CLASS_REMARK = DiagnosticIDs::CLASS_REMARK,
-  CLASS_WARNING = DiagnosticIDs::CLASS_WARNING,
-  CLASS_EXTENSION = DiagnosticIDs::CLASS_EXTENSION,
-  CLASS_ERROR = DiagnosticIDs::CLASS_ERROR,
+  CLASS_NOTE = 0x01,
+  CLASS_REMARK = 0x02,
+  CLASS_WARNING = 0x03,
+  CLASS_EXTENSION = 0x04,
+  CLASS_ERROR = 0x05
 };
 
 struct StaticDiagInfoRec {
@@ -268,60 +269,11 @@ CATEGORY(INSTALLAPI, REFACTORING)
   return Found;
 }
 
-//===----------------------------------------------------------------------===//
-// Custom Diagnostic information
-//===----------------------------------------------------------------------===//
-
-namespace clang {
-namespace diag {
-using CustomDiagDesc = DiagnosticIDs::CustomDiagDesc;
-class CustomDiagInfo {
-  std::vector<CustomDiagDesc> DiagInfo;
-  std::map<CustomDiagDesc, unsigned> DiagIDs;
-  std::map<diag::Group, std::vector<unsigned>> GroupToDiags;
-
-public:
-  /// getDescription - Return the description of the specified custom
-  /// diagnostic.
-  const CustomDiagDesc &getDescription(unsigned DiagID) const {
-    assert(DiagID - DIAG_UPPER_LIMIT < DiagInfo.size() &&
-           "Invalid diagnostic ID");
-    return DiagInfo[DiagID - DIAG_UPPER_LIMIT];
-  }
-
-  unsigned getOrCreateDiagID(DiagnosticIDs::CustomDiagDesc D) {
-    // Check to see if it already exists.
-    std::map<CustomDiagDesc, unsigned>::iterator I = DiagIDs.lower_bound(D);
-    if (I != DiagIDs.end() && I->first == D)
-      return I->second;
-
-    // If not, assign a new ID.
-    unsigned ID = DiagInfo.size() + DIAG_UPPER_LIMIT;
-    DiagIDs.insert(std::make_pair(D, ID));
-    DiagInfo.push_back(D);
-    if (auto Group = D.GetGroup())
-      GroupToDiags[*Group].emplace_back(ID);
-    return ID;
-  }
-
-  ArrayRef<unsigned> getDiagsInGroup(diag::Group G) const {
-    if (auto Diags = GroupToDiags.find(G); Diags != GroupToDiags.end())
-      return Diags->second;
-    return {};
-  }
-};
-
-} // namespace diag
-} // namespace clang
-
-DiagnosticMapping DiagnosticIDs::getDefaultMapping(unsigned DiagID) const {
+DiagnosticMapping DiagnosticIDs::getDefaultMapping(unsigned DiagID) {
   DiagnosticMapping Info = DiagnosticMapping::Make(
       diag::Severity::Fatal, /*IsUser=*/false, /*IsPragma=*/false);
 
-  if (IsCustomDiag(DiagID)) {
-    Info.setSeverity(
-        CustomDiagInfo->getDescription(DiagID).GetDefaultSeverity());
-  } else if (const StaticDiagInfoRec *StaticInfo = GetDiagInfo(DiagID)) {
+  if (const StaticDiagInfoRec *StaticInfo = GetDiagInfo(DiagID)) {
     Info.setSeverity((diag::Severity)StaticInfo->DefaultSeverity);
 
     if (StaticInfo->WarnNoWerror) {
@@ -334,18 +286,6 @@ DiagnosticMapping DiagnosticIDs::getDefaultMapping(unsigned DiagID) const {
   return Info;
 }
 
-void DiagnosticIDs::initCustomDiagMapping(DiagnosticMapping &Mapping,
-                                          unsigned DiagID) {
-  assert(IsCustomDiag(DiagID));
-  const auto &Diag = CustomDiagInfo->getDescription(DiagID);
-  if (auto Group = Diag.GetGroup()) {
-    GroupInfo GroupInfo = GroupInfos[static_cast<size_t>(*Group)];
-    if (static_cast<diag::Severity>(GroupInfo.Severity) != diag::Severity())
-      Mapping.setSeverity(static_cast<diag::Severity>(GroupInfo.Severity));
-    Mapping.setNoWarningAsError(GroupInfo.HasNoWarningAsError);
-  }
-}
-
 /// getCategoryNumberForDiag - Return the category number that a specified
 /// DiagID belongs to, or 0 if no category.
 unsigned DiagnosticIDs::getCategoryNumberForDiag(unsigned DiagID) {
@@ -403,6 +343,61 @@ bool DiagnosticIDs::isDeferrable(unsigned DiagID) {
   return false;
 }
 
+/// getBuiltinDiagClass - Return the class field of the diagnostic.
+///
+static unsigned getBuiltinDiagClass(unsigned DiagID) {
+  if (const StaticDiagInfoRec *Info = GetDiagInfo(DiagID))
+    return Info->Class;
+  return ~0U;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Diagnostic information
+//===----------------------------------------------------------------------===//
+
+namespace clang {
+  namespace diag {
+    class CustomDiagInfo {
+      typedef std::pair<DiagnosticIDs::Level, std::string> DiagDesc;
+      std::vector<DiagDesc> DiagInfo;
+      std::map<DiagDesc, unsigned> DiagIDs;
+    public:
+
+      /// getDescription - Return the description of the specified custom
+      /// diagnostic.
+      StringRef getDescription(unsigned DiagID) const {
+        assert(DiagID - DIAG_UPPER_LIMIT < DiagInfo.size() &&
+               "Invalid diagnostic ID");
+        return DiagInfo[DiagID-DIAG_UPPER_LIMIT].second;
+      }
+
+      /// getLevel - Return the level of the specified custom diagnostic.
+      DiagnosticIDs::Level getLevel(unsigned DiagID) const {
+        assert(DiagID - DIAG_UPPER_LIMIT < DiagInfo.size() &&
+               "Invalid diagnostic ID");
+        return DiagInfo[DiagID-DIAG_UPPER_LIMIT].first;
+      }
+
+      unsigned getOrCreateDiagID(DiagnosticIDs::Level L, StringRef Message,
+                                 DiagnosticIDs &Diags) {
+        DiagDesc D(L, std::string(Message));
+        // Check to see if it already exists.
+        std::map<DiagDesc, unsigned>::iterator I = DiagIDs.lower_bound(D);
+        if (I != DiagIDs.end() && I->first == D)
+          return I->second;
+
+        // If not, assign a new ID.
+ unsigned ID = DiagInfo.size()+DIAG_UPPER_LIMIT; + DiagIDs.insert(std::make_pair(D, ID)); + DiagInfo.push_back(D); + return ID; + } + }; + + } // end diag namespace +} // end clang namespace + + //===----------------------------------------------------------------------===// // Common Diagnostic implementation //===----------------------------------------------------------------------===// @@ -417,32 +412,38 @@ DiagnosticIDs::~DiagnosticIDs() {} /// /// \param FormatString A fixed diagnostic format string that will be hashed and /// mapped to a unique DiagID. -unsigned DiagnosticIDs::getCustomDiagID(CustomDiagDesc Diag) { +unsigned DiagnosticIDs::getCustomDiagID(Level L, StringRef FormatString) { if (!CustomDiagInfo) CustomDiagInfo.reset(new diag::CustomDiagInfo()); - return CustomDiagInfo->getOrCreateDiagID(Diag); + return CustomDiagInfo->getOrCreateDiagID(L, FormatString, *this); } -bool DiagnosticIDs::isWarningOrExtension(unsigned DiagID) const { - return DiagID < diag::DIAG_UPPER_LIMIT - ? getDiagClass(DiagID) != CLASS_ERROR - : CustomDiagInfo->getDescription(DiagID).GetClass() != CLASS_ERROR; + +/// isBuiltinWarningOrExtension - Return true if the unmapped diagnostic +/// level of the specified diagnostic ID is a Warning or Extension. +/// This only works on builtin diagnostics, not custom ones, and is not legal to +/// call on NOTEs. +bool DiagnosticIDs::isBuiltinWarningOrExtension(unsigned DiagID) { + return DiagID < diag::DIAG_UPPER_LIMIT && + getBuiltinDiagClass(DiagID) != CLASS_ERROR; } /// Determine whether the given built-in diagnostic ID is a /// Note. -bool DiagnosticIDs::isNote(unsigned DiagID) const { - return DiagID < diag::DIAG_UPPER_LIMIT && getDiagClass(DiagID) == CLASS_NOTE; +bool DiagnosticIDs::isBuiltinNote(unsigned DiagID) { + return DiagID < diag::DIAG_UPPER_LIMIT && + getBuiltinDiagClass(DiagID) == CLASS_NOTE; } -/// isExtensionDiag - Determine whether the given built-in diagnostic +/// isBuiltinExtensionDiag - Determine whether the given built-in diagnostic /// ID is for an extension of some sort. This also returns EnabledByDefault, /// which is set to indicate whether the diagnostic is ignored by default (in /// which case -pedantic enables it) or treated as a warning/error by default. 
/// -bool DiagnosticIDs::isExtensionDiag(unsigned DiagID, - bool &EnabledByDefault) const { - if (IsCustomDiag(DiagID) || getDiagClass(DiagID) != CLASS_EXTENSION) +bool DiagnosticIDs::isBuiltinExtensionDiag(unsigned DiagID, + bool &EnabledByDefault) { + if (DiagID >= diag::DIAG_UPPER_LIMIT || + getBuiltinDiagClass(DiagID) != CLASS_EXTENSION) return false; EnabledByDefault = @@ -450,7 +451,10 @@ bool DiagnosticIDs::isExtensionDiag(unsigned DiagID, return true; } -bool DiagnosticIDs::isDefaultMappingAsError(unsigned DiagID) const { +bool DiagnosticIDs::isDefaultMappingAsError(unsigned DiagID) { + if (DiagID >= diag::DIAG_UPPER_LIMIT) + return false; + return getDefaultMapping(DiagID).getSeverity() >= diag::Severity::Error; } @@ -460,7 +464,7 @@ StringRef DiagnosticIDs::getDescription(unsigned DiagID) const { if (const StaticDiagInfoRec *Info = GetDiagInfo(DiagID)) return Info->getDescription(); assert(CustomDiagInfo && "Invalid CustomDiagInfo"); - return CustomDiagInfo->getDescription(DiagID).GetDescription(); + return CustomDiagInfo->getDescription(DiagID); } static DiagnosticIDs::Level toLevel(diag::Severity SV) { @@ -485,7 +489,13 @@ static DiagnosticIDs::Level toLevel(diag::Severity SV) { DiagnosticIDs::Level DiagnosticIDs::getDiagnosticLevel(unsigned DiagID, SourceLocation Loc, const DiagnosticsEngine &Diag) const { - unsigned DiagClass = getDiagClass(DiagID); + // Handle custom diagnostics, which cannot be mapped. + if (DiagID >= diag::DIAG_UPPER_LIMIT) { + assert(CustomDiagInfo && "Invalid CustomDiagInfo"); + return CustomDiagInfo->getLevel(DiagID); + } + + unsigned DiagClass = getBuiltinDiagClass(DiagID); if (DiagClass == CLASS_NOTE) return DiagnosticIDs::Note; return toLevel(getDiagnosticSeverity(DiagID, Loc, Diag)); } @@ -499,8 +509,7 @@ DiagnosticIDs::getDiagnosticLevel(unsigned DiagID, SourceLocation Loc, diag::Severity DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, const DiagnosticsEngine &Diag) const { - bool IsCustomDiag = DiagnosticIDs::IsCustomDiag(DiagID); - assert(getDiagClass(DiagID) != CLASS_NOTE); + assert(getBuiltinDiagClass(DiagID) != CLASS_NOTE); // Specific non-error diagnostics may be mapped to various levels from ignored // to error. Errors can only be mapped to fatal. @@ -508,7 +517,7 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, // Get the mapping information, or compute it lazily. DiagnosticsEngine::DiagState *State = Diag.GetDiagStateForLoc(Loc); - DiagnosticMapping Mapping = State->getOrAddMapping((diag::kind)DiagID); + DiagnosticMapping &Mapping = State->getOrAddMapping((diag::kind)DiagID); // TODO: Can a null severity really get here? if (Mapping.getSeverity() != diag::Severity()) @@ -516,15 +525,14 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, // Upgrade ignored diagnostics if -Weverything is enabled. if (State->EnableAllWarnings && Result == diag::Severity::Ignored && - !Mapping.isUser() && - (IsCustomDiag || getDiagClass(DiagID) != CLASS_REMARK)) + !Mapping.isUser() && getBuiltinDiagClass(DiagID) != CLASS_REMARK) Result = diag::Severity::Warning; // Ignore -pedantic diagnostics inside __extension__ blocks. // (The diagnostics controlled by -pedantic are the extension diagnostics // that are not enabled by default.) 
   bool EnabledByDefault = false;
-  bool IsExtensionDiag = isExtensionDiag(DiagID, EnabledByDefault);
+  bool IsExtensionDiag = isBuiltinExtensionDiag(DiagID, EnabledByDefault);
   if (Diag.AllExtensionsSilenced && IsExtensionDiag && !EnabledByDefault)
     return diag::Severity::Ignored;
 
@@ -542,12 +550,10 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc,
   // as well as disabling all messages which are currently mapped to Warning
   // (whether by default or downgraded from Error via e.g. -Wno-error or #pragma
   // diagnostic.)
-  // FIXME: Should -w be ignored for custom warnings without a group?
   if (State->IgnoreAllWarnings) {
-    if ((!IsCustomDiag || CustomDiagInfo->getDescription(DiagID).GetGroup()) &&
-        (Result == diag::Severity::Warning ||
-         (Result >= diag::Severity::Error &&
-          !isDefaultMappingAsError((diag::kind)DiagID))))
+    if (Result == diag::Severity::Warning ||
+        (Result >= diag::Severity::Error &&
+         !isDefaultMappingAsError((diag::kind)DiagID)))
       return diag::Severity::Ignored;
   }
 
@@ -569,10 +575,9 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc,
       DiagID != diag::fatal_too_many_errors && Diag.FatalsAsError)
     Result = diag::Severity::Error;
 
+  // Custom diagnostics always are emitted in system headers.
   bool ShowInSystemHeader =
-      IsCustomDiag
-          ? CustomDiagInfo->getDescription(DiagID).ShouldShowInSystemHeader()
-          : !GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemHeader;
+      !GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemHeader;
 
   // If we are in a system header, we ignore it. We look at the diagnostic class
   // because we also want to ignore extensions and warnings in -Werror and
@@ -592,15 +597,6 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc,
   return Result;
 }
 
-DiagnosticIDs::Class DiagnosticIDs::getDiagClass(unsigned DiagID) const {
-  if (IsCustomDiag(DiagID))
-    return Class(CustomDiagInfo->getDescription(DiagID).GetClass());
-
-  if (const StaticDiagInfoRec *Info = GetDiagInfo(DiagID))
-    return Class(Info->Class);
-  return CLASS_INVALID;
-}
-
 #define GET_DIAG_ARRAYS
 #include "clang/Basic/DiagnosticGroups.inc"
 #undef GET_DIAG_ARRAYS
@@ -646,12 +642,7 @@ DiagnosticIDs::getGroupForWarningOption(StringRef Name) {
   return static_cast<diag::Group>(Found - OptionTable);
 }
 
-std::optional<diag::Group>
-DiagnosticIDs::getGroupForDiag(unsigned DiagID) const {
-  if (IsCustomDiag(DiagID)) {
-    assert(CustomDiagInfo);
-    return CustomDiagInfo->getDescription(DiagID).GetGroup();
-  }
+std::optional<diag::Group> DiagnosticIDs::getGroupForDiag(unsigned DiagID) {
   if (const StaticDiagInfoRec *Info = GetDiagInfo(DiagID))
     return static_cast<diag::Group>(Info->getOptionGroupIndex());
   return std::nullopt;
@@ -682,8 +673,7 @@ std::vector<std::string> DiagnosticIDs::getDiagnosticFlags() {
 /// were filtered out due to having the wrong flavor.
 static bool getDiagnosticsInGroup(diag::Flavor Flavor,
                                   const WarningOption *Group,
-                                  SmallVectorImpl<diag::kind> &Diags,
-                                  diag::CustomDiagInfo *CustomDiagInfo) {
+                                  SmallVectorImpl<diag::kind> &Diags) {
   // An empty group is considered to be a warning group: we have empty groups
   // for GCC compatibility, and GCC does not have remarks.
   if (!Group->Members && !Group->SubGroups)
@@ -702,14 +692,9 @@ static bool getDiagnosticsInGroup(diag::Flavor Flavor,
 
   // Add the members of the subgroups.
   const int16_t *SubGroups = DiagSubGroups + Group->SubGroups;
-  for (; *SubGroups != (int16_t)-1; ++SubGroups) {
-    if (CustomDiagInfo)
-      llvm::copy(
-          CustomDiagInfo->getDiagsInGroup(static_cast<diag::Group>(*SubGroups)),
-          std::back_inserter(Diags));
+  for (; *SubGroups != (int16_t)-1; ++SubGroups)
     NotFound &= getDiagnosticsInGroup(Flavor, &OptionTable[(short)*SubGroups],
-                                      Diags, CustomDiagInfo);
-  }
+                                      Diags);
 
   return NotFound;
 }
@@ -717,49 +702,12 @@ static bool getDiagnosticsInGroup(diag::Flavor Flavor,
 bool DiagnosticIDs::getDiagnosticsInGroup(diag::Flavor Flavor, StringRef Group,
                                           SmallVectorImpl<diag::kind> &Diags) const {
-  if (std::optional<diag::Group> G = getGroupForWarningOption(Group)) {
-    if (CustomDiagInfo)
-      llvm::copy(CustomDiagInfo->getDiagsInGroup(*G),
-                 std::back_inserter(Diags));
-    return ::getDiagnosticsInGroup(Flavor,
-                                   &OptionTable[static_cast<size_t>(*G)],
-                                   Diags, CustomDiagInfo.get());
-  }
+  if (std::optional<diag::Group> G = getGroupForWarningOption(Group))
+    return ::getDiagnosticsInGroup(
+        Flavor, &OptionTable[static_cast<size_t>(*G)], Diags);
   return true;
 }
 
-template <typename Func>
-static void forEachSubGroupImpl(const WarningOption *Group, Func func) {
-  for (const int16_t *SubGroups = DiagSubGroups + Group->SubGroups;
-       *SubGroups != -1; ++SubGroups) {
-    func(static_cast<size_t>(*SubGroups));
-    forEachSubGroupImpl(&OptionTable[*SubGroups], std::move(func));
-  }
-}
-
-template <typename Func>
-static void forEachSubGroup(diag::Group Group, Func func) {
-  const WarningOption *WarningOpt = &OptionTable[static_cast<size_t>(Group)];
-  func(static_cast<size_t>(Group));
-  ::forEachSubGroupImpl(WarningOpt, std::move(func));
-}
-
-void DiagnosticIDs::setGroupSeverity(StringRef Group, diag::Severity Sev) {
-  if (std::optional<diag::Group> G = getGroupForWarningOption(Group)) {
-    ::forEachSubGroup(*G, [&](size_t SubGroup) {
-      GroupInfos[SubGroup].Severity = static_cast<unsigned>(Sev);
-    });
-  }
-}
-
-void DiagnosticIDs::setGroupNoWarningsAsError(StringRef Group, bool Val) {
-  if (std::optional<diag::Group> G = getGroupForWarningOption(Group)) {
-    ::forEachSubGroup(*G, [&](size_t SubGroup) {
-      GroupInfos[static_cast<size_t>(*G)].HasNoWarningAsError = Val;
-    });
-  }
-}
-
 void DiagnosticIDs::getAllDiagnostics(diag::Flavor Flavor,
                                       std::vector<diag::kind> &Diags) {
   for (unsigned i = 0; i != StaticDiagInfoSize; ++i)
@@ -782,7 +730,7 @@ StringRef DiagnosticIDs::getNearestOption(diag::Flavor Flavor,
 
     // Don't suggest groups that are not of this kind.
     llvm::SmallVector<diag::kind, 8> Diags;
-    if (::getDiagnosticsInGroup(Flavor, &O, Diags, nullptr) || Diags.empty())
+    if (::getDiagnosticsInGroup(Flavor, &O, Diags) || Diags.empty())
       continue;
 
     if (Distance == BestDistance) {
@@ -896,8 +844,14 @@ void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag,
 }
 
 bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const {
+  if (DiagID >= diag::DIAG_UPPER_LIMIT) {
+    assert(CustomDiagInfo && "Invalid CustomDiagInfo");
+    // Custom diagnostics.
+    return CustomDiagInfo->getLevel(DiagID) >= DiagnosticIDs::Error;
+  }
+
   // Only errors may be unrecoverable.
- if (getDiagClass(DiagID) < CLASS_ERROR) + if (getBuiltinDiagClass(DiagID) < CLASS_ERROR) return false; if (DiagID == diag::err_unavailable || diff --git a/clang/lib/Basic/TargetID.cpp b/clang/lib/Basic/TargetID.cpp index 3c06d9bad1dc0..fa1bfec2aacb9 100644 --- a/clang/lib/Basic/TargetID.cpp +++ b/clang/lib/Basic/TargetID.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Basic/TargetID.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 29f5cd14e46e1..9219533382109 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -198,6 +198,22 @@ TargetInfo::checkCFProtectionBranchSupported(DiagnosticsEngine &Diags) const { return false; } +CFBranchLabelSchemeKind TargetInfo::getDefaultCFBranchLabelScheme() const { + // if this hook is called, the target should override it to return a + // non-default scheme + llvm::report_fatal_error("not implemented"); +} + +bool TargetInfo::checkCFBranchLabelSchemeSupported( + const CFBranchLabelSchemeKind Scheme, DiagnosticsEngine &Diags) const { + if (Scheme != CFBranchLabelSchemeKind::Default) + Diags.Report(diag::err_opt_not_valid_on_target) + << (Twine("mcf-branch-label-scheme=") + + getCFBranchLabelSchemeFlagVal(Scheme)) + .str(); + return false; +} + bool TargetInfo::checkCFProtectionReturnSupported(DiagnosticsEngine &Diags) const { Diags.Report(diag::err_opt_not_valid_on_target) << "cf-protection=return"; diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index b6ea4440507ea..2a225820208c8 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -44,7 +44,7 @@ ArrayRef RISCVTargetInfo::getGCCRegNames() const { "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", // CSRs - "fflags", "frm", "vtype", "vl", "vxsat", "vxrm" + "fflags", "frm", "vtype", "vl", "vxsat", "vxrm", "sf.vcix_state" }; // clang-format on return llvm::ArrayRef(GCCRegNames); diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index 351ef21e197c4..bf40edb8683b3 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -134,6 +134,28 @@ class RISCVTargetInfo : public TargetInfo { bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize, bool &HasSizeMismatch) const override; + + bool checkCFProtectionBranchSupported(DiagnosticsEngine &) const override { + // Always generate Zicfilp lpad insns + // Non-zicfilp CPUs would read them as NOP + return true; + } + + CFBranchLabelSchemeKind getDefaultCFBranchLabelScheme() const override { + return CFBranchLabelSchemeKind::FuncSig; + } + + bool + checkCFBranchLabelSchemeSupported(const CFBranchLabelSchemeKind Scheme, + DiagnosticsEngine &Diags) const override { + switch (Scheme) { + case CFBranchLabelSchemeKind::Default: + case CFBranchLabelSchemeKind::Unlabeled: + case CFBranchLabelSchemeKind::FuncSig: + return true; + } + return TargetInfo::checkCFBranchLabelSchemeSupported(Scheme, Diags); + } }; class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo { public: diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 8a26db7971cba..cc79562de2871 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -386,6 +386,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo 
final PointerWidth = PointerAlign = 64; SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; + AddrSpaceMap = &SPIRDefIsGenMap; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0"); @@ -418,6 +419,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final void setAuxTarget(const TargetInfo *Aux) override; + void adjust(DiagnosticsEngine &Diags, LangOptions &Opts) override { + TargetInfo::adjust(Diags, Opts); + } + bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } }; diff --git a/clang/lib/Basic/Targets/Sparc.h b/clang/lib/Basic/Targets/Sparc.h index 3357bee33e1ac..ee0d3e2b4329e 100644 --- a/clang/lib/Basic/Targets/Sparc.h +++ b/clang/lib/Basic/Targets/Sparc.h @@ -151,7 +151,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV8TargetInfo : public SparcTargetInfo { public: SparcV8TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcTargetInfo(Triple, Opts) { - resetDataLayout("E-m:e-p:32:32-i64:64-f128:64-n32-S64"); + resetDataLayout("E-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64"); // NetBSD / OpenBSD use long (same as llvm default); everyone else uses int. switch (getTriple().getOS()) { default: @@ -188,7 +188,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV8elTargetInfo : public SparcV8TargetInfo { public: SparcV8elTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcV8TargetInfo(Triple, Opts) { - resetDataLayout("e-m:e-p:32:32-i64:64-f128:64-n32-S64"); + resetDataLayout("e-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64"); } }; @@ -198,7 +198,7 @@ class LLVM_LIBRARY_VISIBILITY SparcV9TargetInfo : public SparcTargetInfo { SparcV9TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : SparcTargetInfo(Triple, Opts) { // FIXME: Support Sparc quad-precision long double? - resetDataLayout("E-m:e-i64:64-n32:64-S128"); + resetDataLayout("E-m:e-i64:64-i128:128-n32:64-S128"); // This is an LP64 platform. 
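The three SPARC data layout strings above add `i128:128`, aligning the IR-level layout with the 16-byte alignment the front end already assumes for 128-bit integers. A minimal compile-time check of that assumption (hypothetical test, not part of the patch; assumes a sparc64 target with this change applied):

```c++
// With i128:128 in the data layout, Clang and the backend agree on this:
static_assert(alignof(__int128) == 16,
              "__int128 is naturally aligned on SPARC");
```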
LongWidth = LongAlign = PointerWidth = PointerAlign = 64; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 916c92adb8930..62c6a57e8b7c8 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -429,7 +429,6 @@ static bool initTargetOptions(DiagnosticsEngine &Diags, Options.BBSections = llvm::StringSwitch(CodeGenOpts.BBSections) .Case("all", llvm::BasicBlockSection::All) - .Case("labels", llvm::BasicBlockSection::Labels) .StartsWith("list=", llvm::BasicBlockSection::List) .Case("none", llvm::BasicBlockSection::None) .Default(llvm::BasicBlockSection::None); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 566252b263680..da3eca73bfb57 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2878,7 +2878,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fmodf: case Builtin::BI__builtin_fmodf16: case Builtin::BI__builtin_fmodl: - case Builtin::BI__builtin_fmodf128: { + case Builtin::BI__builtin_fmodf128: + case Builtin::BI__builtin_elementwise_fmod: { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *Arg1 = EmitScalarExpr(E->getArg(0)); Value *Arg2 = EmitScalarExpr(E->getArg(1)); @@ -9880,6 +9881,22 @@ Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred, return C; } +Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple, + llvm::StructType *Ty) { + if (PredTuple->getType() == Ty) + return PredTuple; + + Value *Ret = llvm::PoisonValue::get(Ty); + for (unsigned I = 0; I < Ty->getNumElements(); ++I) { + Value *Pred = Builder.CreateExtractValue(PredTuple, I); + Pred = EmitSVEPredicateCast( + Pred, cast(Ty->getTypeAtIndex(I))); + Ret = Builder.CreateInsertValue(Ret, Pred, I); + } + + return Ret; +} + Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { @@ -10386,41 +10403,6 @@ Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags, return Tuple; } -Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) { - // Multi-vector results should be broken up into a single (wide) result - // vector. - auto *StructTy = dyn_cast(Call->getType()); - if (!StructTy) - return Call; - - auto *VTy = dyn_cast(StructTy->getTypeAtIndex(0U)); - if (!VTy) - return Call; - unsigned N = StructTy->getNumElements(); - - // We may need to emit a cast to a svbool_t - bool IsPredTy = VTy->getElementType()->isIntegerTy(1); - unsigned MinElts = IsPredTy ? 
16 : VTy->getMinNumElements(); - - ScalableVectorType *WideVTy = - ScalableVectorType::get(VTy->getElementType(), MinElts * N); - Value *Ret = llvm::PoisonValue::get(WideVTy); - for (unsigned I = 0; I < N; ++I) { - Value *SRet = Builder.CreateExtractValue(Call, I); - assert(SRet->getType() == VTy && "Unexpected type for result value"); - Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); - - if (IsPredTy) - SRet = EmitSVEPredicateCast( - SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16)); - - Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx); - } - Call = Ret; - - return Call; -} - void CodeGenFunction::GetAArch64SVEProcessedOperands( unsigned BuiltinID, const CallExpr *E, SmallVectorImpl &Ops, SVETypeFlags TypeFlags) { @@ -10551,12 +10533,16 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, getSVEOverloadTypes(TypeFlags, Ty, Ops)); Value *Call = Builder.CreateCall(F, Ops); + if (Call->getType() == Ty) + return Call; + // Predicate results must be converted to svbool_t. - if (auto PredTy = dyn_cast(Call->getType())) - if (PredTy->getScalarType()->isIntegerTy(1)) - Call = EmitSVEPredicateCast(Call, cast(Ty)); + if (auto PredTy = dyn_cast(Ty)) + return EmitSVEPredicateCast(Call, PredTy); + if (auto PredTupleTy = dyn_cast(Ty)) + return EmitSVEPredicateTupleCast(Call, PredTupleTy); - return FormSVEBuiltinResult(Call); + llvm_unreachable("unsupported element count!"); } switch (BuiltinID) { @@ -10888,9 +10874,8 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, TypeFlags.isOverloadNone() ? CGM.getIntrinsic(Builtin->LLVMIntrinsic) : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); - Value *Call = Builder.CreateCall(F, Ops); - return FormSVEBuiltinResult(Call); + return Builder.CreateCall(F, Ops); } Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, @@ -13589,6 +13574,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_famax; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax"); } + case NEON::BI__builtin_neon_vscale_f16: + case NEON::BI__builtin_neon_vscaleq_f16: + case NEON::BI__builtin_neon_vscale_f32: + case NEON::BI__builtin_neon_vscaleq_f32: + case NEON::BI__builtin_neon_vscaleq_f64: { + Int = Intrinsic::aarch64_neon_fp8_fscale; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale"); + } } } @@ -22347,10 +22340,60 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, return Store; } + // XCValu + case RISCV::BI__builtin_riscv_cv_alu_addN: + ID = Intrinsic::riscv_cv_alu_addN; + break; + case RISCV::BI__builtin_riscv_cv_alu_addRN: + ID = Intrinsic::riscv_cv_alu_addRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_adduN: + ID = Intrinsic::riscv_cv_alu_adduN; + break; + case RISCV::BI__builtin_riscv_cv_alu_adduRN: + ID = Intrinsic::riscv_cv_alu_adduRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_clip: + ID = Intrinsic::riscv_cv_alu_clip; + break; + case RISCV::BI__builtin_riscv_cv_alu_clipu: + ID = Intrinsic::riscv_cv_alu_clipu; + break; + case RISCV::BI__builtin_riscv_cv_alu_extbs: + return Builder.CreateSExt(Builder.CreateTrunc(Ops[0], Int8Ty), Int32Ty, + "extbs"); + case RISCV::BI__builtin_riscv_cv_alu_extbz: + return Builder.CreateZExt(Builder.CreateTrunc(Ops[0], Int8Ty), Int32Ty, + "extbz"); + case RISCV::BI__builtin_riscv_cv_alu_exths: + return Builder.CreateSExt(Builder.CreateTrunc(Ops[0], Int16Ty), Int32Ty, + "exths"); + case RISCV::BI__builtin_riscv_cv_alu_exthz: + return 
Builder.CreateZExt(Builder.CreateTrunc(Ops[0], Int16Ty), Int32Ty, + "exthz"); + case RISCV::BI__builtin_riscv_cv_alu_slet: + return Builder.CreateZExt(Builder.CreateICmpSLE(Ops[0], Ops[1]), Int32Ty, + "sle"); + case RISCV::BI__builtin_riscv_cv_alu_sletu: + return Builder.CreateZExt(Builder.CreateICmpULE(Ops[0], Ops[1]), Int32Ty, + "sleu"); + case RISCV::BI__builtin_riscv_cv_alu_subN: + ID = Intrinsic::riscv_cv_alu_subN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subRN: + ID = Intrinsic::riscv_cv_alu_subRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subuN: + ID = Intrinsic::riscv_cv_alu_subuN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subuRN: + ID = Intrinsic::riscv_cv_alu_subuRN; + break; - // Vector builtins are handled from here. + // Vector builtins are handled from here. #include "clang/Basic/riscv_vector_builtin_cg.inc" - // SiFive Vector builtins are handled from here. + + // SiFive Vector builtins are handled from here. #include "clang/Basic/riscv_sifive_vector_builtin_cg.inc" } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 4782e80f22177..8887c4de7c4c8 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -900,7 +900,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { return SingletonId; \ } #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_OPAQUE_PTR_TYPE(Name, AS, Width, Align, Id, SingletonId) \ +#define AMDGPU_OPAQUE_PTR_TYPE(Name, Id, SingletonId, Width, Align, AS) \ case BuiltinType::Id: { \ if (!SingletonId) \ SingletonId = \ diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index 4ccff581cadb2..3fd0237a1c61d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -85,7 +85,8 @@ class CGDebugInfo { #include "clang/Basic/OpenCLExtensionTypes.def" #define WASM_TYPE(Name, Id, SingletonId) llvm::DIType *SingletonId = nullptr; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) llvm::DIType *SingletonId = nullptr; +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ + llvm::DIType *SingletonId = nullptr; #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) \ llvm::DIType *SingletonId = nullptr; diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index c44f38ef02a3f..8dcb5f6100619 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -640,13 +640,13 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, addUsedGlobal(COMDATKey); } - // If comdats are in use and supported, place the initializer function into - // the comdat group of the global. In the MS ABI, initializers are mangled - // and have their own comdat, so we don't include them in the group for - // consistency with MSVC. + // If we used a COMDAT key for the global ctor, the init function can be + // discarded if the global ctor entry is discarded. + // FIXME: Do we need to restrict this to ELF and Wasm? 
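The XCValu cases above lower several CORE-V builtins to plain IR rather than target intrinsics: the byte/half extensions become trunc-plus-extend pairs, and the set-less-equal forms become an integer compare zero-extended to i32. A reference model of those semantics (hypothetical helper names, for illustration only):

```c++
#include <cstdint>

// Semantics of the CreateSExt/CreateZExt-of-CreateTrunc lowerings above.
int32_t extbs_ref(int32_t x) { return (int8_t)x; }      // sign-extend low byte
uint32_t exthz_ref(uint32_t x) { return (uint16_t)x; }  // zero-extend low half

// __builtin_riscv_cv_alu_slet(a, b): 1 when a <= b (signed compare), else 0.
int32_t slet_ref(int32_t a, int32_t b) { return a <= b ? 1 : 0; }
```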
llvm::Comdat *C = Addr->getComdat(); - if (COMDATKey && C && getTriple().supportsCOMDAT() && - !getTarget().getCXXABI().isMicrosoft()) { + if (COMDATKey && C && + (getTarget().getTriple().isOSBinFormatELF() || + getTarget().getTriple().isOSBinFormatWasm())) { Fn->setComdat(C); } } else { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 9166db4c74128..df4994ba9af6e 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -122,7 +122,7 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align, Builder.SetInsertPoint(getPostAllocaInsertPoint()); V = getTargetHooks().performAddrSpaceCast( *this, V, getASTAllocaAddressSpace(), LangAS::Default, - Ty->getPointerTo(DestAddrSpace), /*non-null*/ true); + Builder.getPtrTy(DestAddrSpace), /*non-null*/ true); } return RawAddress(V, Ty, Align, KnownNonNull); @@ -469,7 +469,8 @@ static RawAddress createReferenceTemporary(CodeGenFunction &CGF, if (AS != LangAS::Default) C = TCG.performAddrSpaceCast( CGF.CGM, GV, AS, LangAS::Default, - GV->getValueType()->getPointerTo( + llvm::PointerType::get( + CGF.getLLVMContext(), CGF.getContext().getTargetAddressSpace(LangAS::Default))); // FIXME: Should we put the new global into a COMDAT? return RawAddress(C, GV->getValueType(), alignment); @@ -3207,7 +3208,7 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { if (AS != T.getAddressSpace()) { auto TargetAS = getContext().getTargetAddressSpace(T.getAddressSpace()); - auto PtrTy = ATPO.getElementType()->getPointerTo(TargetAS); + auto PtrTy = llvm::PointerType::get(CGM.getLLVMContext(), TargetAS); auto ASC = getTargetHooks().performAddrSpaceCast( CGM, ATPO.getPointer(), AS, T.getAddressSpace(), PtrTy); ATPO = ConstantAddress(ASC, ATPO.getElementType(), ATPO.getAlignment()); @@ -3835,9 +3836,7 @@ void CodeGenFunction::EmitCfiCheckFail() { llvm::StructType::get(Int8Ty, SourceLocationTy, VoidPtrTy); llvm::Value *V = Builder.CreateConstGEP2_32( - CfiCheckFailDataTy, - Builder.CreatePointerCast(Data, CfiCheckFailDataTy->getPointerTo(0)), 0, - 0); + CfiCheckFailDataTy, Builder.CreatePointerCast(Data, UnqualPtrTy), 0, 0); Address CheckKindAddr(V, Int8Ty, getIntAlign()); llvm::Value *CheckKind = Builder.CreateLoad(CheckKindAddr); @@ -6115,36 +6114,6 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, if (ResolvedFnInfo) *ResolvedFnInfo = &FnInfo; - // C99 6.5.2.2p6: - // If the expression that denotes the called function has a type - // that does not include a prototype, [the default argument - // promotions are performed]. If the number of arguments does not - // equal the number of parameters, the behavior is undefined. If - // the function is defined with a type that includes a prototype, - // and either the prototype ends with an ellipsis (, ...) or the - // types of the arguments after promotion are not compatible with - // the types of the parameters, the behavior is undefined. If the - // function is defined with a type that does not include a - // prototype, and the types of the arguments after promotion are - // not compatible with those of the parameters after promotion, - // the behavior is undefined [except in some trivial cases]. - // That is, in the general case, we should assume that a call - // through an unprototyped function type works like a *non-variadic* - // call. The way we make this work is to cast to the exact type - // of the promoted arguments. - // - // Chain calls use this same code path to add the invisible chain parameter - // to the function type. 
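The CGExpr hunks above are part of the opaque-pointer migration: each `Ty->getPointerTo(AS)` is replaced by a pointer type built from the address space alone. The idiom, as a minimal sketch (assuming an `llvm::IRBuilder<> B`, a context `Ctx`, and an address space number `AS`):

```c++
// Opaque pointers carry no pointee type, so only the address space matters.
llvm::PointerType *PtrTy = B.getPtrTy(AS);                 // was Ty->getPointerTo(AS)
llvm::PointerType *Same = llvm::PointerType::get(Ctx, AS); // equivalent spelling
```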
- if (isa(FnType) || Chain) { - llvm::Type *CalleeTy = getTypes().GetFunctionType(FnInfo); - int AS = Callee.getFunctionPointer()->getType()->getPointerAddressSpace(); - CalleeTy = CalleeTy->getPointerTo(AS); - - llvm::Value *CalleePtr = Callee.getFunctionPointer(); - CalleePtr = Builder.CreateBitCast(CalleePtr, CalleeTy, "callee.knr.cast"); - Callee.setFunctionPointer(CalleePtr); - } - // HIP function pointer contains kernel handle when it is used in triple // chevron. The kernel stub needs to be loaded from kernel handle and used // as callee. diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 59d8fc830dcc8..3237d93ca31ce 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -415,7 +415,7 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, } CallInst *CI = B.CreateCall(FunctionCallee(Fn), Args); - (void)CI; + CI->setCallingConv(Fn->getCallingConv()); // FIXME: Handle codegen for return type semantics. // See: https://github.com/llvm/llvm-project/issues/57875 B.CreateRetVoid(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 9e095a3755219..c920d93957b16 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1073,9 +1073,8 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getTargetInfo().getNewAlign() / 8)); // Cast the void pointer and get the address of the globalized variable. - llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack"); LValue VarAddr = CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy); Rec.second.PrivateAddr = VarAddr.getAddress(); @@ -1930,7 +1929,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); + Src, Bld.getPtrTy(0), CGF.SizeTy); llvm::Value *LB = CGF.EmitLoadOfScalar( TypedAddress, /*Volatile=*/false, @@ -1939,8 +1938,8 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( Args.emplace_back(LB); ++Idx; Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); - TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); + TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0), + CGF.SizeTy); llvm::Value *UB = CGF.EmitLoadOfScalar( TypedAddress, /*Volatile=*/false, @@ -2079,7 +2078,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, GV->setAlignment(Align.getAsAlign()); return Address( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( + GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace( VD->getType().getAddressSpace()))), VarTy, Align); } diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 883333f0924dd..c9f9b688d0d8a 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -586,9 +586,9 @@ const FullSourceLoc BackendConsumer::getBestLocationFromDebugLoc( if (D.isLocationAvailable()) { 
D.getLocation(Filename, Line, Column); if (Line > 0) { - auto FE = FileMgr.getFile(Filename); + auto FE = FileMgr.getOptionalFileRef(Filename); if (!FE) - FE = FileMgr.getFile(D.getAbsolutePath()); + FE = FileMgr.getOptionalFileRef(D.getAbsolutePath()); if (FE) { // If -gcolumn-info was not used, Column will be 0. This upsets the // source manager, so pass 1 if Column is not set. diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 8a1f6ff00ada7..3e2abbd9bc109 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4646,6 +4646,8 @@ class CodeGenFunction : public CodeGenTypeCache { unsigned BuiltinID); llvm::Value *EmitSVEPredicateCast(llvm::Value *Pred, llvm::ScalableVectorType *VTy); + llvm::Value *EmitSVEPredicateTupleCast(llvm::Value *PredTuple, + llvm::StructType *Ty); llvm::Value *EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl &Ops, unsigned IntID); @@ -4670,12 +4672,6 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID); - /// FormSVEBuiltinResult - Returns the struct of scalable vectors as a wider - /// vector. It extracts the scalable vector from the struct and inserts into - /// the wider vector. This avoids the error when allocating space in llvm - /// for struct of scalable vectors if a function returns struct. - llvm::Value *FormSVEBuiltinResult(llvm::Value *Call); - llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index d53d47979f29f..25c1c496a4f27 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1164,6 +1164,16 @@ void CodeGenModule::Release() { // Indicate that we want to instrument branch control flow protection. getModule().addModuleFlag(llvm::Module::Min, "cf-protection-branch", 1); + + auto Scheme = CodeGenOpts.getCFBranchLabelScheme(); + if (Target.checkCFBranchLabelSchemeSupported(Scheme, getDiags())) { + if (Scheme == CFBranchLabelSchemeKind::Default) + Scheme = Target.getDefaultCFBranchLabelScheme(); + getModule().addModuleFlag( + llvm::Module::Error, "cf-branch-label-scheme", + llvm::MDString::get(getLLVMContext(), + getCFBranchLabelSchemeFlagVal(Scheme))); + } } if (CodeGenOpts.FunctionReturnThunks) @@ -5612,8 +5622,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, emitter->finalize(GV); // If it is safe to mark the global 'constant', do so now. - GV->setConstant(!NeedsGlobalCtor && !NeedsGlobalDtor && - D->getType().isConstantStorage(getContext(), true, true)); + GV->setConstant((D->hasAttr() && LangOpts.CUDAIsDevice) || + (!NeedsGlobalCtor && !NeedsGlobalDtor && + D->getType().isConstantStorage(getContext(), true, true))); // If it is in a read-only section, mark it 'constant'. 
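With the `CodeGenModule::Release` hunk above, enabling `-fcf-protection=branch` on a target that passes `checkCFBranchLabelSchemeSupported` records the resolved scheme as a string module flag. A sketch of reading it back from a produced module `M` (flag name taken from this patch):

```c++
// Added with llvm::Module::Error semantics, so mismatched values fail to link.
if (auto *MD = llvm::dyn_cast_or_null<llvm::MDString>(
        M.getModuleFlag("cf-branch-label-scheme")))
  llvm::errs() << "branch label scheme: " << MD->getString() << "\n";
```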
if (const SectionAttr *SA = D->getAttr()) { diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 77c1b27cebf40..0b486a644f57b 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -558,7 +558,7 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { llvm_unreachable("Unexpected wasm reference builtin type!"); \ } break; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_OPAQUE_PTR_TYPE(Name, AS, Width, Align, Id, SingletonId) \ +#define AMDGPU_OPAQUE_PTR_TYPE(Name, Id, SingletonId, Width, Align, AS) \ case BuiltinType::Id: \ return llvm::PointerType::get(getLLVMContext(), AS); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index dcc35d5689831..965e09a7a760e 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -3639,7 +3639,7 @@ static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index d5e8e4f7a5916..7dd5c518e7149 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -58,6 +58,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { SPIRVTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) : CommonSPIRTargetCodeGenInfo(std::make_unique(CGT)) {} void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; + LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const override; llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, SyncScope Scope, llvm::AtomicOrdering Ordering, @@ -217,6 +219,28 @@ void SPIRVTargetCodeGenInfo::setCUDAKernelCallingConvention( } } +LangAS +SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const { + assert(!CGM.getLangOpts().OpenCL && + !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) && + "Address space agnostic languages only"); + // If we're here it means that we're using the SPIRDefIsGen ASMap, hence for + // the global AS we can rely on either cuda_device or sycl_global to be + // correct; however, since this is not a CUDA Device context, we use + // sycl_global to prevent confusion with the assertion. 
+ LangAS DefaultGlobalAS = getLangASFromTargetAS( + CGM.getContext().getTargetAddressSpace(LangAS::sycl_global)); + if (!D) + return DefaultGlobalAS; + + LangAS AddrSpace = D->getType().getAddressSpace(); + if (AddrSpace != LangAS::Default) + return AddrSpace; + + return DefaultGlobalAS; +} + llvm::SyncScope::ID SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope, llvm::AtomicOrdering, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 44548fa9d706f..fba6a8853c396 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1005,6 +1005,17 @@ static void appendOneArg(InputArgList &Args, const Arg *Opt, Copy->setOwnsValues(Opt->getOwnsValues()); Opt->setOwnsValues(false); Args.append(Copy); + if (Opt->getAlias()) { + const Arg *Alias = Opt->getAlias(); + unsigned Index = Args.MakeIndex(Alias->getSpelling()); + auto AliasCopy = std::make_unique(Alias->getOption(), + Args.getArgString(Index), Index); + AliasCopy->getValues() = Alias->getValues(); + AliasCopy->setOwnsValues(false); + if (Alias->isClaimed()) + AliasCopy->claim(); + Copy->setAlias(std::move(AliasCopy)); + } } bool Driver::readConfigFile(StringRef FileName, @@ -6470,6 +6481,7 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, case llvm::Triple::ZOS: TC = std::make_unique(*this, Target, Args); break; + case llvm::Triple::Vulkan: case llvm::Triple::ShaderModel: TC = std::make_unique(*this, Target, Args); break; diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index aaa4fdf03be1e..687a38333e128 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -1192,7 +1192,7 @@ Error OffloadBundler::ListBundleIDsInFile( StringRef InputFileName, const OffloadBundlerConfig &BundlerConfig) { // Open Input file. ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(InputFileName); + MemoryBuffer::getFileOrSTDIN(InputFileName, /*IsText=*/true); if (std::error_code EC = CodeOrErr.getError()) return createFileError(InputFileName, EC); @@ -1324,7 +1324,7 @@ Error OffloadBundler::BundleFiles() { InputBuffers.reserve(BundlerConfig.InputFileNames.size()); for (auto &I : BundlerConfig.InputFileNames) { ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(I); + MemoryBuffer::getFileOrSTDIN(I, /*IsText=*/true); if (std::error_code EC = CodeOrErr.getError()) return createFileError(I, EC); InputBuffers.emplace_back(std::move(*CodeOrErr)); @@ -1392,7 +1392,8 @@ Error OffloadBundler::BundleFiles() { Error OffloadBundler::UnbundleFiles() { // Open Input file. ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(BundlerConfig.InputFileNames.front()); + MemoryBuffer::getFileOrSTDIN(BundlerConfig.InputFileNames.front(), + /*IsText=*/true); if (std::error_code EC = CodeOrErr.getError()) return createFileError(BundlerConfig.InputFileNames.front(), EC); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 0bab48caf1a5e..b9987288d82d1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4424,21 +4424,7 @@ static void RenderDiagnosticsOptions(const Driver &D, const ArgList &Args, CmdArgs.push_back("-fno-diagnostics-show-note-include-stack"); } - // Color diagnostics are parsed by the driver directly from argv and later - // re-parsed to construct this job; claim any possible color diagnostic here - // to avoid warn_drv_unused_argument and diagnose bad - // OPT_fdiagnostics_color_EQ values. 
- Args.getLastArg(options::OPT_fcolor_diagnostics, - options::OPT_fno_color_diagnostics); - if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_color_EQ)) { - StringRef Value(A->getValue()); - if (Value != "always" && Value != "never" && Value != "auto") - D.Diag(diag::err_drv_invalid_argument_to_option) - << Value << A->getOption().getName(); - } - - if (D.getDiags().getDiagnosticOptions().ShowColors) - CmdArgs.push_back("-fcolor-diagnostics"); + handleColorDiagnosticsArgs(D, Args, CmdArgs); if (Args.hasArg(options::OPT_fansi_escape_codes)) CmdArgs.push_back("-fansi-escape-codes"); @@ -6228,9 +6214,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_sections_EQ)) { StringRef Val = A->getValue(); - if (Triple.isX86() && Triple.isOSBinFormatELF()) { - if (Val != "all" && Val != "labels" && Val != "none" && - !Val.starts_with("list=")) + if (Val == "labels") { + D.Diag(diag::warn_drv_deprecated_arg) + << A->getAsString(Args) << /*hasReplacement=*/true + << "-fbasic-block-address-map"; + CmdArgs.push_back("-fbasic-block-address-map"); + } else if (Triple.isX86() && Triple.isOSBinFormatELF()) { + if (Val != "all" && Val != "none" && !Val.starts_with("list=")) D.Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); else @@ -7012,6 +7002,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) { CmdArgs.push_back( Args.MakeArgString(Twine("-fcf-protection=") + A->getValue())); + + if (Arg *SA = Args.getLastArg(options::OPT_mcf_branch_label_scheme_EQ)) + CmdArgs.push_back(Args.MakeArgString(Twine("-mcf-branch-label-scheme=") + + SA->getValue())); } if (Arg *A = Args.getLastArg(options::OPT_mfunction_return_EQ)) @@ -9242,6 +9236,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString( "--device-linker=" + TC.getTripleString() + "=" + "-lm")); } + auto HasCompilerRT = getToolChain().getVFS().exists( + TC.getCompilerRT(Args, "builtins", ToolChain::FT_Static)); + if (HasCompilerRT) + CmdArgs.push_back( + Args.MakeArgString("--device-linker=" + TC.getTripleString() + "=" + + "-lclang_rt.builtins")); }); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 043d9e4876443..0c6a585c3acff 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2960,3 +2960,22 @@ void tools::addMCModel(const Driver &D, const llvm::opt::ArgList &Args, } } } + +void tools::handleColorDiagnosticsArgs(const Driver &D, const ArgList &Args, + ArgStringList &CmdArgs) { + // Color diagnostics are parsed by the driver directly from argv and later + // re-parsed to construct this job; claim any possible color diagnostic here + // to avoid warn_drv_unused_argument and diagnose bad + // OPT_fdiagnostics_color_EQ values. 
+ Args.getLastArg(options::OPT_fcolor_diagnostics, + options::OPT_fno_color_diagnostics); + if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_color_EQ)) { + StringRef Value(A->getValue()); + if (Value != "always" && Value != "never" && Value != "auto") + D.Diag(diag::err_drv_invalid_argument_to_option) + << Value << A->getOption().getName(); + } + + if (D.getDiags().getDiagnosticOptions().ShowColors) + CmdArgs.push_back("-fcolor-diagnostics"); +} diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 8695d3fe5b55b..eff21b210b424 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -233,6 +233,10 @@ void addMCModel(const Driver &D, const llvm::opt::ArgList &Args, const llvm::Reloc::Model &RelocationModel, llvm::opt::ArgStringList &CmdArgs); +/// Handle the -f{no}-color-diagnostics and -f{no}-diagnostics-colors options. +void handleColorDiagnosticsArgs(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); + } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 6ce79d27e98c4..98350690f8d20 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -727,13 +727,7 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, addFortranDialectOptions(Args, CmdArgs); - // Color diagnostics are parsed by the driver directly from argv and later - // re-parsed to construct this job; claim any possible color diagnostic here - // to avoid warn_drv_unused_argument. - Args.getLastArg(options::OPT_fcolor_diagnostics, - options::OPT_fno_color_diagnostics); - if (Diags.getDiagnosticOptions().ShowColors) - CmdArgs.push_back("-fcolor-diagnostics"); + handleColorDiagnosticsArgs(D, Args, CmdArgs); // LTO mode is parsed by the Clang driver library. LTOKind LTOMode = D.getLTOMode(); diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 647580e4e235d..7c028f18c0308 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -186,6 +186,9 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) TC.addSanitizerArgs(Args, CmdArgs, "-l", ""); + // Other drivers typically add library search paths (`-L`) here via + // TC.AddFilePathLibArgs(). We don't do that on PS4 as the PS4 linker + // searches those locations by default. Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group, options::OPT_s, options::OPT_t}); @@ -290,6 +293,7 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) TC.addSanitizerArgs(Args, CmdArgs, "-l", ""); + TC.AddFilePathLibArgs(Args, CmdArgs); Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group, options::OPT_s, options::OPT_t}); @@ -338,7 +342,7 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple, } // Allow --sysroot= to override the root directory for header and library - // search, and -sysroot to override header search. If both are specified, + // search, and -isysroot to override header search. If both are specified, // -isysroot overrides --sysroot for header search. 
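The corrected comment above describes the PS4/PS5 precedence rule: `--sysroot` relocates both header and library search, `-isysroot` relocates header search only, and `-isysroot` wins for headers when both appear. A hedged sketch of that resolution logic (hypothetical helper, not the driver code):

```c++
#include <optional>
#include <string>

struct Roots { std::string Headers, Libs; };

Roots resolve(std::optional<std::string> Sysroot,
              std::optional<std::string> ISysroot, std::string Default) {
  std::string LibRoot = Sysroot.value_or(Default);
  std::string HdrRoot = ISysroot.value_or(LibRoot); // -isysroot wins for headers
  return {HdrRoot, LibRoot};
}
```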
auto OverrideRoot = [&](const options::ID &Opt, std::string &Root, StringRef Default) { @@ -382,6 +386,8 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple, llvm::sys::path::append(Dir, "target/include"); CheckSDKPartExists(Dir, "system headers"); } + + getFilePaths().push_back("."); } void toolchains::PS4PS5Base::AddClangSystemIncludeArgs( diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index cf39038dcac37..fd3232b7c1b06 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -266,8 +266,7 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } // Avoid AsanInitInternal cycle, Issue #64126. - if (ToolChain.getTriple().isX86() && SA.needsSharedRt() && - SA.needsAsanRt()) { + if (SA.needsSharedRt() && SA.needsAsanRt()) { CmdArgs.push_back("-z"); CmdArgs.push_back("now"); } @@ -334,10 +333,11 @@ Solaris::Solaris(const Driver &D, const llvm::Triple &Triple, } SanitizerMask Solaris::getSupportedSanitizers() const { + const bool IsSparc = getTriple().getArch() == llvm::Triple::sparc; const bool IsX86 = getTriple().getArch() == llvm::Triple::x86; SanitizerMask Res = ToolChain::getSupportedSanitizers(); - // FIXME: Omit X86_64 until 64-bit support is figured out. - if (IsX86) { + // FIXME: Omit SparcV9 and X86_64 until 64-bit support is figured out. + if (IsSparc || IsX86) { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp index 75c2dec22400b..6f42b36bd36a4 100644 --- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp +++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp @@ -217,8 +217,8 @@ struct LocationFileChecker { SmallVector, bool>> &KnownFiles) : CI(CI), KnownFiles(KnownFiles), ExternalFileEntries() { for (const auto &KnownFile : KnownFiles) - if (auto FileEntry = CI.getFileManager().getFile(KnownFile.first)) - KnownFileEntries.insert(*FileEntry); + if (auto FE = CI.getFileManager().getOptionalFileRef(KnownFile.first)) + KnownFileEntries.insert(*FE); } private: diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 93836ec5402fa..bffff0d27af3a 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -2395,7 +2395,7 @@ void ASTUnit::TranslateStoredDiagnostics( // Rebuild the StoredDiagnostic. if (SD.Filename.empty()) continue; - auto FE = FileMgr.getFile(SD.Filename); + auto FE = FileMgr.getOptionalFileRef(SD.Filename); if (!FE) continue; SourceLocation FileLoc; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 5f2a9637e3ea4..240305b33824b 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -427,12 +427,8 @@ static void InitializeFileRemapping(DiagnosticsEngine &Diags, } // Create the file entry for the file that we're mapping from. - const FileEntry *FromFile = - FileMgr.getVirtualFile(RF.first, ToFile->getSize(), 0); - if (!FromFile) { - Diags.Report(diag::err_fe_remap_missing_from_file) << RF.first; - continue; - } + FileEntryRef FromFile = + FileMgr.getVirtualFileRef(RF.first, ToFile->getSize(), 0); // Override the contents of the "from" file with the contents of // the "to" file. @@ -1926,7 +1922,7 @@ ModuleLoadResult CompilerInstance::findOrCompileModuleAndReadAST( // Check whether M refers to the file in the prebuilt module path. 
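Several hunks in this patch (CompilerInstance, ASTUnit, ExtractAPI, InstallAPI, HeaderSearch) replace `FileManager::getFile` with `getOptionalFileRef`, which returns a `FileEntryRef` that remembers the name used to reach the file. The replacement idiom, assuming a `clang::FileManager &FM` and a path string:

```c++
// Old: if (auto FE = FM.getFile(Path)) use(*FE);   // const FileEntry *
if (clang::OptionalFileEntryRef FE = FM.getOptionalFileRef(Path))
  llvm::errs() << FE->getName() << "\n";            // FileEntryRef API
```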
if (M && M->getASTFile()) - if (auto ModuleFile = FileMgr->getFile(ModuleFilename)) + if (auto ModuleFile = FileMgr->getOptionalFileRef(ModuleFilename)) if (*ModuleFile == M->getASTFile()) return M; diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index efd852593468a..a0291ccfea245 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1688,6 +1688,18 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, else if (Opts.CFProtectionBranch) GenerateArg(Consumer, OPT_fcf_protection_EQ, "branch"); + if (Opts.CFProtectionBranch) { + switch (Opts.getCFBranchLabelScheme()) { + case CFBranchLabelSchemeKind::Default: + break; +#define CF_BRANCH_LABEL_SCHEME(Kind, FlagVal) \ + case CFBranchLabelSchemeKind::Kind: \ + GenerateArg(Consumer, OPT_mcf_branch_label_scheme_EQ, #FlagVal); \ + break; +#include "clang/Basic/CFProtectionOptions.def" + } + } + if (Opts.FunctionReturnThunks) GenerateArg(Consumer, OPT_mfunction_return_EQ, "thunk-extern"); @@ -2022,6 +2034,22 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Name; } + if (Opts.CFProtectionBranch && T.isRISCV()) { + if (const Arg *A = Args.getLastArg(OPT_mcf_branch_label_scheme_EQ)) { + const auto Scheme = + llvm::StringSwitch(A->getValue()) +#define CF_BRANCH_LABEL_SCHEME(Kind, FlagVal) \ + .Case(#FlagVal, CFBranchLabelSchemeKind::Kind) +#include "clang/Basic/CFProtectionOptions.def" + .Default(CFBranchLabelSchemeKind::Default); + if (Scheme != CFBranchLabelSchemeKind::Default) + Opts.setCFBranchLabelScheme(Scheme); + else + Diags.Report(diag::err_drv_invalid_value) + << A->getAsString(Args) << A->getValue(); + } + } + if (const Arg *A = Args.getLastArg(OPT_mfunction_return_EQ)) { auto Val = llvm::StringSwitch(A->getValue()) .Case("keep", llvm::FunctionReturnThunksKind::Keep) @@ -3802,6 +3830,9 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, case LangOptions::ClangABI::Ver18: GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "18.0"); break; + case LangOptions::ClangABI::Ver19: + GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "19.0"); + break; case LangOptions::ClangABI::Latest: break; } @@ -3952,6 +3983,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, } } + if (Opts.CFProtectionBranch) { + if (const Arg *A = Args.getLastArg(OPT_mcf_branch_label_scheme_EQ)) { + const auto Scheme = + llvm::StringSwitch(A->getValue()) +#define CF_BRANCH_LABEL_SCHEME(Kind, FlagVal) \ + .Case(#FlagVal, CFBranchLabelSchemeKind::Kind) +#include "clang/Basic/CFProtectionOptions.def" + .Default(CFBranchLabelSchemeKind::Default); + Opts.setCFBranchLabelScheme(Scheme); + } + } + if ((Args.hasArg(OPT_fsycl_is_device) || Args.hasArg(OPT_fsycl_is_host)) && !Args.hasArg(OPT_sycl_std_EQ)) { // If the user supplied -fsycl-is-device or -fsycl-is-host, but failed to @@ -4332,6 +4375,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Opts.setClangABICompat(LangOptions::ClangABI::Ver17); else if (Major <= 18) Opts.setClangABICompat(LangOptions::ClangABI::Ver18); + else if (Major <= 19) + Opts.setClangABICompat(LangOptions::ClangABI::Ver19); } else if (Ver != "latest") { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); diff --git a/clang/lib/Frontend/LogDiagnosticPrinter.cpp b/clang/lib/Frontend/LogDiagnosticPrinter.cpp index 
4e963af837f01..469d1c22633aa 100644 --- a/clang/lib/Frontend/LogDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/LogDiagnosticPrinter.cpp @@ -129,8 +129,7 @@ void LogDiagnosticPrinter::HandleDiagnostic(DiagnosticsEngine::Level Level, DE.DiagnosticLevel = Level; DE.WarningOption = - std::string(Info.getDiags()->getDiagnosticIDs()->getWarningOptionForDiag( - DE.DiagnosticID)); + std::string(DiagnosticIDs::getWarningOptionForDiag(DE.DiagnosticID)); // Format the message. SmallString<100> MessageStr; @@ -161,3 +160,4 @@ void LogDiagnosticPrinter::HandleDiagnostic(DiagnosticsEngine::Level Level, // Record the diagnostic entry. Entries.push_back(DE); } + diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp index cf5a9437e89e6..6e1f949f543a5 100644 --- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp @@ -213,7 +213,7 @@ class RewriteIncludesAction::RewriteImportsListener : public ASTReaderListener { void visitModuleFile(StringRef Filename, serialization::ModuleKind Kind) override { - auto File = CI.getFileManager().getFile(Filename); + auto File = CI.getFileManager().getOptionalFileRef(Filename); assert(File && "missing file for loaded module?"); // Only rewrite each module file once. diff --git a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp index d1db31763e500..0887b5a504f05 100644 --- a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp @@ -202,7 +202,7 @@ class SDiagsWriter : public DiagnosticConsumer { /// Emit the string information for diagnostic flags. unsigned getEmitDiagnosticFlag(DiagnosticsEngine::Level DiagLevel, - const Diagnostic *Diag = nullptr); + unsigned DiagID = 0); unsigned getEmitDiagnosticFlag(StringRef DiagName); @@ -536,13 +536,11 @@ unsigned SDiagsWriter::getEmitCategory(unsigned int category) { } unsigned SDiagsWriter::getEmitDiagnosticFlag(DiagnosticsEngine::Level DiagLevel, - const Diagnostic *Diag) { - if (!Diag || DiagLevel == DiagnosticsEngine::Note) + unsigned DiagID) { + if (DiagLevel == DiagnosticsEngine::Note) return 0; // No flag for notes. - StringRef FlagName = - Diag->getDiags()->getDiagnosticIDs()->getWarningOptionForDiag( - Diag->getID()); + StringRef FlagName = DiagnosticIDs::getWarningOptionForDiag(DiagID); return getEmitDiagnosticFlag(FlagName); } @@ -657,7 +655,7 @@ void SDiagsWriter::EmitDiagnosticMessage(FullSourceLoc Loc, PresumedLoc PLoc, unsigned DiagID = DiagnosticIDs::getCategoryNumberForDiag(Info->getID()); Record.push_back(getEmitCategory(DiagID)); // Emit the diagnostic flag string lazily and get the mapped ID. - Record.push_back(getEmitDiagnosticFlag(Level, Info)); + Record.push_back(getEmitDiagnosticFlag(Level, Info->getID())); } else { Record.push_back(getEmitCategory()); Record.push_back(getEmitDiagnosticFlag(Level)); diff --git a/clang/lib/Frontend/TextDiagnosticPrinter.cpp b/clang/lib/Frontend/TextDiagnosticPrinter.cpp index 28f7218dc23f5..dac5c44fe9256 100644 --- a/clang/lib/Frontend/TextDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/TextDiagnosticPrinter.cpp @@ -70,17 +70,13 @@ static void printDiagnosticOptions(raw_ostream &OS, // flag it as such. Note that diagnostics could also have been mapped by a // pragma, but we don't currently have a way to distinguish this. 
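The diagnostic-printer hunks above and below all rely on the same API change: `DiagnosticIDs::getWarningOptionForDiag` (and the related classification queries) are now static, so callers no longer reach through `Info.getDiags()->getDiagnosticIDs()`. The simplified lookup, assuming a diagnostic ID `DiagID`:

```c++
// Static lookup of the -W flag name; empty when the diagnostic has no flag.
llvm::StringRef Opt = clang::DiagnosticIDs::getWarningOptionForDiag(DiagID);
if (!Opt.empty())
  llvm::errs() << "[-W" << Opt << "]\n";
```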
if (Level == DiagnosticsEngine::Error && - Info.getDiags()->getDiagnosticIDs()->isWarningOrExtension( - Info.getID()) && - !Info.getDiags()->getDiagnosticIDs()->isDefaultMappingAsError( - Info.getID())) { + DiagnosticIDs::isBuiltinWarningOrExtension(Info.getID()) && + !DiagnosticIDs::isDefaultMappingAsError(Info.getID())) { OS << " [-Werror"; Started = true; } - StringRef Opt = - Info.getDiags()->getDiagnosticIDs()->getWarningOptionForDiag( - Info.getID()); + StringRef Opt = DiagnosticIDs::getWarningOptionForDiag(Info.getID()); if (!Opt.empty()) { OS << (Started ? "," : " [") << (Level == DiagnosticsEngine::Remark ? "-R" : "-W") << Opt; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index f5cc07c303f9e..ff392e7122a44 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -120,6 +120,7 @@ set(ppc_htm_files set(riscv_files riscv_bitmanip.h + riscv_corev_alu.h riscv_crypto.h riscv_ntlh.h sifive_vector.h diff --git a/clang/lib/Headers/bmi2intrin.h b/clang/lib/Headers/bmi2intrin.h index f0a3343bef915..bdb61b13fb83b 100644 --- a/clang/lib/Headers/bmi2intrin.h +++ b/clang/lib/Headers/bmi2intrin.h @@ -15,7 +15,13 @@ #define __BMI2INTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) +#endif /// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits /// starting at bit number \a __Y. @@ -38,8 +44,7 @@ /// The lower 8 bits specify the bit number of the lowest bit to zero. /// \returns The partially zeroed 32-bit value. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bzhi_u32(unsigned int __X, unsigned int __Y) -{ +_bzhi_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bzhi_si(__X, __Y); } @@ -68,8 +73,7 @@ _bzhi_u32(unsigned int __X, unsigned int __Y) /// The 32-bit mask specifying where to deposit source bits. /// \returns The 32-bit result. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pdep_u32(unsigned int __X, unsigned int __Y) -{ +_pdep_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pdep_si(__X, __Y); } @@ -98,8 +102,7 @@ _pdep_u32(unsigned int __X, unsigned int __Y) /// The 32-bit mask specifying which source bits to extract. /// \returns The 32-bit result. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pext_u32(unsigned int __X, unsigned int __Y) -{ +_pext_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pext_si(__X, __Y); } @@ -124,8 +127,7 @@ _pext_u32(unsigned int __X, unsigned int __Y) /// A pointer to memory for storing the upper half of the product. /// \returns The lower half of the product. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) -{ +_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { unsigned long long __res = (unsigned long long) __X * __Y; *__P = (unsigned int)(__res >> 32); return (unsigned int)__res; @@ -154,8 +156,7 @@ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) /// The lower 8 bits specify the bit number of the lowest bit to zero. /// \returns The partially zeroed 64-bit value. 
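Adding `constexpr` to `__DEFAULT_FN_ATTRS` in C++11 and later makes these BMI2 wrappers usable in constant expressions, assuming the underlying `__builtin_ia32_*` builtins are constant-evaluable (which is the point of this series). A hedged usage sketch (requires `-mbmi2` and C++11 or newer):

```c++
#include <x86intrin.h>

// _bzhi_u32(x, 8) zeroes everything above bit 7; now checkable at compile time.
static_assert(_bzhi_u32(0xFFFFFFFFu, 8) == 0xFFu, "low byte preserved");
```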
static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bzhi_u64(unsigned long long __X, unsigned long long __Y) -{ +_bzhi_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_bzhi_di(__X, __Y); } @@ -184,8 +185,7 @@ _bzhi_u64(unsigned long long __X, unsigned long long __Y) /// The 64-bit mask specifying where to deposit source bits. /// \returns The 64-bit result. static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pdep_u64(unsigned long long __X, unsigned long long __Y) -{ +_pdep_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_pdep_di(__X, __Y); } @@ -214,8 +214,7 @@ _pdep_u64(unsigned long long __X, unsigned long long __Y) /// The 64-bit mask specifying which source bits to extract. /// \returns The 64-bit result. static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pext_u64(unsigned long long __X, unsigned long long __Y) -{ +_pext_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_pext_di(__X, __Y); } @@ -241,8 +240,7 @@ _pext_u64(unsigned long long __X, unsigned long long __Y) /// \returns The lower half of the product. static __inline__ unsigned long long __DEFAULT_FN_ATTRS _mulx_u64 (unsigned long long __X, unsigned long long __Y, - unsigned long long *__P) -{ + unsigned long long *__P) { unsigned __int128 __res = (unsigned __int128) __X * __Y; *__P = (unsigned long long) (__res >> 64); return (unsigned long long) __res; diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h index 78bffe68e221a..673f043ccfb30 100644 --- a/clang/lib/Headers/bmiintrin.h +++ b/clang/lib/Headers/bmiintrin.h @@ -17,7 +17,12 @@ /* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT instruction behaves as BSF on non-BMI targets, there is code that expects to use it as a potentially faster version of BSF. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __RELAXED_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__)) constexpr +#else #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#endif /// Counts the number of trailing zero bits in the operand. /// @@ -164,7 +169,13 @@ _mm_tzcnt_64(unsigned long long __X) #if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) +#endif /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. @@ -224,8 +235,7 @@ __andn_u32(unsigned int __X, unsigned int __Y) /// extracted bits. /// \see _bextr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__bextr_u32(unsigned int __X, unsigned int __Y) -{ +__bextr_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bextr_u32(__X, __Y); } @@ -249,9 +259,8 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// extracted bits. 
 /// \see __bextr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
-{
-  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) {
+  return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR2 */
@@ -445,8 +454,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
 ///    extracted bits.
 /// \see _bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__bextr_u64(unsigned long long __X, unsigned long long __Y)
-{
+__bextr_u64(unsigned long long __X, unsigned long long __Y) {
   return __builtin_ia32_bextr_u64(__X, __Y);
 }
 
@@ -470,9 +478,8 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
 ///    extracted bits.
 /// \see __bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
-{
-  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) {
+  return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR2 */
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index b139f9eb7d999..810a16d75f022 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -378,6 +378,22 @@ template <typename T> constexpr float asfloat(T F) {
   return __detail::bit_cast<float>(F);
 }
 
+//===----------------------------------------------------------------------===//
+// asint builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn int asint(T Val)
+/// \brief Interprets the bit pattern of Val as an integer.
+/// \param Val The input value.
+
+template <typename T, int N> constexpr vector<int, N> asint(vector<T, N> V) {
+  return __detail::bit_cast<int>(V);
+}
+
+template <typename T> constexpr int asint(T F) {
+  return __detail::bit_cast<int>(F);
+}
+
 //===----------------------------------------------------------------------===//
 // asin builtins
 //===----------------------------------------------------------------------===//
@@ -913,6 +929,40 @@ float3 floor(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_floor)
 float4 floor(float4);
 
+//===----------------------------------------------------------------------===//
+// fmod builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T fmod(T x, T y)
+/// \brief Returns the floating-point remainder of x divided by y.
+/// \param x [in] The dividend.
+/// \param y [in] The divisor.
+///
+/// Returns the floating-point remainder of the x parameter divided by the y
+/// parameter.
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +half fmod(half, half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +half2 fmod(half2, half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +half3 fmod(half3, half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +half4 fmod(half4, half4); + +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +float fmod(float, float); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +float2 fmod(float2, float2); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +float3 fmod(float3, float3); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_fmod) +float4 fmod(float4, float4); + //===----------------------------------------------------------------------===// // frac builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Headers/lzcntintrin.h b/clang/lib/Headers/lzcntintrin.h index f4ddce9d0e683..db00474ffd394 100644 --- a/clang/lib/Headers/lzcntintrin.h +++ b/clang/lib/Headers/lzcntintrin.h @@ -15,7 +15,13 @@ #define __LZCNTINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#endif #ifndef _MSC_VER /// Counts the number of leading zero bits in the operand. diff --git a/clang/lib/Headers/riscv_corev_alu.h b/clang/lib/Headers/riscv_corev_alu.h new file mode 100644 index 0000000000000..d2832ddf72efe --- /dev/null +++ b/clang/lib/Headers/riscv_corev_alu.h @@ -0,0 +1,128 @@ +/*===---- riscv_corev_alu.h - CORE-V ALU intrinsics ------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_COREV_ALU_H
+#define __RISCV_COREV_ALU_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__riscv_xcvalu)
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_abs(long a) {
+  return __builtin_abs(a);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_slet(long a, long b) {
+  return __builtin_riscv_cv_alu_slet(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_sletu(unsigned long a, unsigned long b) {
+  return __builtin_riscv_cv_alu_sletu(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_min(long a, long b) {
+  return __builtin_elementwise_min(a, b);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_minu(unsigned long a, unsigned long b) {
+  return __builtin_elementwise_min(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_max(long a, long b) {
+  return __builtin_elementwise_max(a, b);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_maxu(unsigned long a, unsigned long b) {
+  return __builtin_elementwise_max(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_exths(int16_t a) {
+  return __builtin_riscv_cv_alu_exths(a);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_exthz(uint16_t a) {
+  return __builtin_riscv_cv_alu_exthz(a);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_extbs(int8_t a) {
+  return __builtin_riscv_cv_alu_extbs(a);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_extbz(uint8_t a) {
+  return __builtin_riscv_cv_alu_extbz(a);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_clip(long a,
+                                                              unsigned long b) {
+  return __builtin_riscv_cv_alu_clip(a, b);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_clipu(unsigned long a, unsigned long b) {
+  return __builtin_riscv_cv_alu_clipu(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addN(long a, long b,
+                                                              uint8_t shft) {
+  return __builtin_riscv_cv_alu_addN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_adduN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_adduN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addRN(long a, long b,
+                                                               uint8_t shft) {
+  return __builtin_riscv_cv_alu_addRN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_adduRN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_adduRN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subN(long a, long b,
+                                                              uint8_t shft) {
+  return __builtin_riscv_cv_alu_subN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_subuN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_subuN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subRN(long a, long b,
+                                                               uint8_t shft) {
+  return __builtin_riscv_cv_alu_subRN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_subuRN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_subuRN(a, b, shft);
+}
+
+#endif // defined(__riscv_xcvalu)
+
+#if defined(__cplusplus)
+}
+#endif + +#endif // define __RISCV_COREV_ALU_H diff --git a/clang/lib/Headers/tbmintrin.h b/clang/lib/Headers/tbmintrin.h index f4e848a1c0019..48a9d07568ff2 100644 --- a/clang/lib/Headers/tbmintrin.h +++ b/clang/lib/Headers/tbmintrin.h @@ -15,7 +15,13 @@ #define __TBMINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) +#endif #define __bextri_u32(a, b) \ ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \ diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index f00bc56429f1a..35d0aefaf69a6 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -780,7 +780,7 @@ void USRGenerator::VisitType(QualType T) { #include "clang/Basic/RISCVVTypes.def" #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case BuiltinType::Id: \ Out << "@BT@" << #Name; \ break; diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp index 04d06f46d2652..2ebe72bf021cf 100644 --- a/clang/lib/InstallAPI/Frontend.cpp +++ b/clang/lib/InstallAPI/Frontend.cpp @@ -107,7 +107,7 @@ InstallAPIContext::findAndRecordFile(const FileEntry *FE, } void InstallAPIContext::addKnownHeader(const HeaderFile &H) { - auto FE = FM->getFile(H.getPath()); + auto FE = FM->getOptionalFileRef(H.getPath()); if (!FE) return; // File does not exist. KnownFiles[*FE] = H.getType(); diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 4914c10e62d0c..8826ab449df49 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -227,7 +227,7 @@ std::string HeaderSearch::getPrebuiltModuleFileName(StringRef ModuleName, ".pcm"); else llvm::sys::path::append(Result, ModuleName + ".pcm"); - if (getFileMgr().getFile(Result.str())) + if (getFileMgr().getOptionalFileRef(Result)) return std::string(Result); } @@ -246,7 +246,7 @@ std::string HeaderSearch::getPrebuiltImplicitModuleFileName(Module *Module) { llvm::sys::path::append(CachePath, ModuleCacheHash); std::string FileName = getCachedModuleFileNameImpl(ModuleName, ModuleMapPath, CachePath); - if (!FileName.empty() && getFileMgr().getFile(FileName)) + if (!FileName.empty() && getFileMgr().getOptionalFileRef(FileName)) return FileName; } return {}; @@ -655,7 +655,7 @@ OptionalFileEntryRef DirectoryLookup::DoFrameworkLookup( ++NumFrameworkLookups; // If the framework dir doesn't exist, we fail. - auto Dir = FileMgr.getDirectory(FrameworkName); + auto Dir = FileMgr.getOptionalDirectoryRef(FrameworkName); if (!Dir) return std::nullopt; @@ -718,7 +718,7 @@ OptionalFileEntryRef DirectoryLookup::DoFrameworkLookup( bool FoundFramework = false; do { // Determine whether this directory exists. 
-    auto Dir = FileMgr.getDirectory(FrameworkPath);
+    auto Dir = FileMgr.getOptionalDirectoryRef(FrameworkPath);
     if (!Dir)
       break;
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index eed7eca2e7356..2aada51c71c50 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1144,7 +1144,8 @@ Module *ModuleMap::inferFrameworkModule(DirectoryEntryRef FrameworkDir,
     if (SubframeworkDirName.empty())
       break;
 
-    if (auto SubDir = FileMgr.getDirectory(SubframeworkDirName)) {
+    if (auto SubDir =
+            FileMgr.getOptionalDirectoryRef(SubframeworkDirName)) {
       if (*SubDir == FrameworkDir) {
         FoundParent = true;
         break;
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
index 8221db46e06ac..1a71f03b18236 100644
--- a/clang/lib/Lex/PPLexerChange.cpp
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -229,7 +229,7 @@ static void computeRelativePath(FileManager &FM, const DirectoryEntry *Dir,
   StringRef FilePath = File.getDir().getName();
   StringRef Path = FilePath;
   while (!Path.empty()) {
-    if (auto CurDir = FM.getDirectory(Path)) {
+    if (auto CurDir = FM.getOptionalDirectoryRef(Path)) {
       if (*CurDir == Dir) {
         Result = FilePath.substr(Path.size());
         llvm::sys::path::append(Result,
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index 0261e8ea3c9b7..e66abd6873794 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -976,14 +976,25 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
                               /*IsReadOnly=*/false, /*IsZero=*/false);
     break;
   case OpenACCClauseKind::Collapse: {
-    tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Force,
-                                       ClauseKind);
-    ExprResult NumLoops =
+    bool HasForce = tryParseAndConsumeSpecialTokenKind(
+        *this, OpenACCSpecialTokenKind::Force, ClauseKind);
+    ExprResult LoopCount =
         getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
-    if (NumLoops.isInvalid()) {
+    if (LoopCount.isInvalid()) {
       Parens.skipToEnd();
       return OpenACCCanContinue();
     }
+
+    LoopCount = getActions().OpenACC().ActOnIntExpr(
+        OpenACCDirectiveKind::Invalid, ClauseKind,
+        LoopCount.get()->getBeginLoc(), LoopCount.get());
+
+    if (LoopCount.isInvalid()) {
+      Parens.skipToEnd();
+      return OpenACCCanContinue();
+    }
+
+    ParsedClause.setCollapseDetails(HasForce, LoopCount.get());
     break;
   }
   case OpenACCClauseKind::Bind: {
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index e9e39c11ffbaa..009b8d000e6b0 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -271,6 +271,49 @@ static bool isInStlNamespace(const Decl *D) {
   return DC->isStdNamespace();
 }
 
+static bool isPointerLikeType(QualType Type) {
+  return isRecordWithAttr<PointerAttr>(Type) || Type->isPointerType() ||
+         Type->isNullPtrType();
+}
+
+// Returns true if the given Record decl is a form of `GSLOwner<Pointer>`
+// type, e.g. std::vector<int*>, std::optional<int*>.
+static bool isContainerOfPointer(const RecordDecl *Container) {
+  if (const auto *CTSD =
+          dyn_cast_if_present<ClassTemplateSpecializationDecl>(Container)) {
+    if (!CTSD->hasAttr<OwnerAttr>()) // Container must be a GSL owner type.
+      return false;
+    const auto &TAs = CTSD->getTemplateArgs();
+    return TAs.size() > 0 && TAs[0].getKind() == TemplateArgument::Type &&
+           isPointerLikeType(TAs[0].getAsType());
+  }
+  return false;
+}
+static bool isContainerOfOwner(const RecordDecl *Container) {
+  const auto *CTSD =
+      dyn_cast_if_present<ClassTemplateSpecializationDecl>(Container);
+  if (!CTSD)
+    return false;
+  if (!CTSD->hasAttr<OwnerAttr>()) // Container must be a GSL owner type.
+    return false;
+  const auto &TAs = CTSD->getTemplateArgs();
+  return TAs.size() > 0 && TAs[0].getKind() == TemplateArgument::Type &&
+         isRecordWithAttr<OwnerAttr>(TAs[0].getAsType());
+}
+
+// Returns true if the given Record is `std::initializer_list`.
+static bool isStdInitializerListOfPointer(const RecordDecl *RD) {
+  if (const auto *CTSD =
+          dyn_cast_if_present<ClassTemplateSpecializationDecl>(RD)) {
+    const auto &TAs = CTSD->getTemplateArgs();
+    return isInStlNamespace(RD) && RD->getIdentifier() &&
+           RD->getName() == "initializer_list" && TAs.size() > 0 &&
+           TAs[0].getKind() == TemplateArgument::Type &&
+           isPointerLikeType(TAs[0].getAsType());
+  }
+  return false;
+}
+
 static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) {
   if (auto *Conv = dyn_cast_or_null<CXXConversionDecl>(Callee))
     if (isRecordWithAttr<PointerAttr>(Conv->getConversionType()) &&
@@ -282,8 +325,7 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) {
           Callee->getFunctionObjectParameterType()) &&
       !isRecordWithAttr<OwnerAttr>(Callee->getFunctionObjectParameterType()))
     return false;
-  if (Callee->getReturnType()->isPointerType() ||
-      isRecordWithAttr<PointerAttr>(Callee->getReturnType())) {
+  if (isPointerLikeType(Callee->getReturnType())) {
     if (!Callee->getIdentifier())
       return false;
     return llvm::StringSwitch<bool>(Callee->getName())
@@ -331,6 +373,103 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) {
   return false;
 }
 
+// Returns true if the given constructor is a copy-like constructor, such as
+// `Ctor(Owner<U>&&)` or `Ctor(const Owner<U>&)`.
+static bool isCopyLikeConstructor(const CXXConstructorDecl *Ctor) {
+  if (!Ctor || Ctor->param_size() != 1)
+    return false;
+  const auto *ParamRefType =
+      Ctor->getParamDecl(0)->getType()->getAs<ReferenceType>();
+  if (!ParamRefType)
+    return false;
+
+  // Check if the first parameter type is "Owner<U>".
+  if (const auto *TST =
+          ParamRefType->getPointeeType()->getAs<TemplateSpecializationType>())
+    return TST->getTemplateName()
+        .getAsTemplateDecl()
+        ->getTemplatedDecl()
+        ->hasAttr<OwnerAttr>();
+  return false;
+}
+
+// Returns true if we should perform the GSL analysis on the first argument for
+// the given constructor.
+static bool
+shouldTrackFirstArgumentForConstructor(const CXXConstructExpr *Ctor) {
+  const auto *LHSRecordDecl = Ctor->getConstructor()->getParent();
+
+  // Case 1, construct a GSL pointer, e.g. std::string_view
+  // Always inspect when LHS is a pointer.
+  if (LHSRecordDecl->hasAttr<PointerAttr>())
+    return true;
+
+  if (Ctor->getConstructor()->getNumParams() != 1 ||
+      !isContainerOfPointer(LHSRecordDecl))
+    return false;
+
+  // Now, the LHS is an Owner<Pointer> type, e.g., std::vector<string_view>.
+  //
+  // At a high level, we cannot precisely determine what the nested pointer
+  // owns. However, by analyzing the RHS owner type, we can use heuristics to
+  // infer ownership information. These heuristics are designed to be
+  // conservative, minimizing false positives while still providing meaningful
+  // diagnostics.
+  //
+  // While this inference isn't perfect, it helps catch common use-after-free
+  // patterns.
+  auto RHSArgType = Ctor->getArg(0)->getType();
+  const auto *RHSRD = RHSArgType->getAsRecordDecl();
+  // LHS is constructed from an initializer_list.
+  //
+  // std::initializer_list is a proxy object that provides access to the backing
+  // array. We perform analysis on it to determine if there are any dangling
+  // temporaries in the backing array.
+  // E.g. std::vector<string_view> abc = {string()};
+  if (isStdInitializerListOfPointer(RHSRD))
+    return true;
+
+  // RHS must be an owner.
+  if (!isRecordWithAttr<OwnerAttr>(RHSArgType))
+    return false;
+
+  // Bail out if the RHS is Owner<Pointer>.
+  //
+  // We cannot reliably determine what the LHS nested pointer owns -- it could
+  // be the entire RHS or the nested pointer in RHS. To avoid false positives,
+  // we skip this case, such as:
+  //   std::stack<std::string_view> s(std::deque<std::string_view>{});
+  //
+  // TODO: this also has a false negative, it doesn't catch the case like:
+  //   std::optional<span<int*>> os = std::vector<int*>{}
+  if (isContainerOfPointer(RHSRD))
+    return false;
+
+  // Assume that the nested Pointer is constructed from the nested Owner.
+  // E.g. std::optional<string_view> sv = std::optional<string>(s);
+  if (isContainerOfOwner(RHSRD))
+    return true;
+
+  // Now, the LHS is an Owner<Pointer> and the RHS is an Owner<X>, where X is
+  // neither an `Owner` nor a `Pointer`.
+  //
+  // Use the constructor's signature as a hint. If it is a copy-like constructor
+  // `Owner1<Pointer>(Owner2<X>&&)`, we assume that the nested pointer is
+  // constructed from X. In such cases, we do not diagnose, as `X` is not an
+  // owner, e.g.
+  //   std::optional<string_view> sv = std::optional<X>();
+  if (const auto *PrimaryCtorTemplate =
+          Ctor->getConstructor()->getPrimaryTemplate();
+      PrimaryCtorTemplate &&
+      isCopyLikeConstructor(dyn_cast_if_present<CXXConstructorDecl>(
+          PrimaryCtorTemplate->getTemplatedDecl()))) {
+    return false;
+  }
+  // Assume that the nested pointer is constructed from the whole RHS.
+  // E.g. optional<string_view> s = std::string();
+  return true;
+}
+
 // Return true if this is a "normal" assignment operator.
 // We assume that a normal assignment operator always returns *this, that is,
 // an lvalue reference that is the same type as the implicit object parameter
@@ -473,12 +612,12 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call,
     if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr<LifetimeBoundAttr>())
       VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]);
     else if (EnableGSLAnalysis && I == 0) {
+      // Perform GSL analysis for the first argument
       if (shouldTrackFirstArgument(Callee)) {
         VisitGSLPointerArg(Callee, Args[0]);
-      } else if (auto *CCE = dyn_cast<CXXConstructExpr>(Call);
-                 CCE &&
-                 CCE->getConstructor()->getParent()->hasAttr<PointerAttr>()) {
-        VisitGSLPointerArg(CCE->getConstructor(), Args[0]);
+      } else if (auto *Ctor = dyn_cast<CXXConstructExpr>(Call);
+                 Ctor && shouldTrackFirstArgumentForConstructor(Ctor)) {
+        VisitGSLPointerArg(Ctor->getConstructor(), Args[0]);
       }
     }
   }
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index d19f79b6ddefc..2913d16fca482 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -117,33 +117,30 @@ struct BuiltinTypeDeclBuilder {
     if (Record->isCompleteDefinition())
       return *this;
 
+    ASTContext &Ctx = S.getASTContext();
     TypeSourceInfo *ElementTypeInfo = nullptr;
-    QualType Ty = Record->getASTContext().VoidPtrTy;
+    QualType ElemTy = Ctx.Char8Ty;
     if (Template) {
       if (const auto *TTD = dyn_cast<TemplateTypeParmDecl>(
               Template->getTemplateParameters()->getParam(0))) {
-        Ty = Record->getASTContext().getPointerType(
-            QualType(TTD->getTypeForDecl(), 0));
-        QualType ElemType = QualType(TTD->getTypeForDecl(), 0);
-        ElementTypeInfo = S.getASTContext().getTrivialTypeSourceInfo(
-            ElemType, SourceLocation());
+        ElemTy = QualType(TTD->getTypeForDecl(), 0);
       }
     }
+    ElementTypeInfo = Ctx.getTrivialTypeSourceInfo(ElemTy, SourceLocation());
 
     // add handle member with resource type attributes
     QualType AttributedResTy = QualType();
     SmallVector<const Attr *> Attrs = {
-        HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC),
-        IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr,
-        RawBuffer ? HLSLRawBufferAttr::CreateImplicit(Record->getASTContext())
-                  : nullptr,
-        ElementTypeInfo ? HLSLContainedTypeAttr::CreateImplicit(
-                              Record->getASTContext(), ElementTypeInfo)
-                        : nullptr};
-    Attr *ResourceAttr =
-        HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK);
-    if (CreateHLSLAttributedResourceType(S, Ty, Attrs, AttributedResTy))
+        HLSLResourceClassAttr::CreateImplicit(Ctx, RC),
+        IsROV ? HLSLROVAttr::CreateImplicit(Ctx) : nullptr,
+        RawBuffer ? HLSLRawBufferAttr::CreateImplicit(Ctx) : nullptr,
+        ElementTypeInfo
+            ? HLSLContainedTypeAttr::CreateImplicit(Ctx, ElementTypeInfo)
+            : nullptr};
+    Attr *ResourceAttr = HLSLResourceAttr::CreateImplicit(Ctx, RK);
+    if (CreateHLSLAttributedResourceType(S, Ctx.HLSLResourceTy, Attrs,
+                                         AttributedResTy))
       addMemberVariable("h", AttributedResTy, {ResourceAttr}, Access);
     return *this;
   }
@@ -193,36 +190,8 @@ struct BuiltinTypeDeclBuilder {
         ExplicitSpecifier(), false, true, false,
         ConstexprSpecKind::Unspecified);
 
-    DeclRefExpr *Fn =
-        lookupBuiltinFunction(AST, S, "__builtin_hlsl_create_handle");
-    Expr *RCExpr = emitResourceClassExpr(AST, RC);
-    Expr *Call = CallExpr::Create(AST, Fn, {RCExpr}, AST.VoidPtrTy, VK_PRValue,
-                                  SourceLocation(), FPOptionsOverride());
-
-    CXXThisExpr *This = CXXThisExpr::Create(
-        AST, SourceLocation(), Constructor->getFunctionObjectParameterType(),
-        true);
-    Expr *Handle = MemberExpr::CreateImplicit(AST, This, false, Fields["h"],
-                                              Fields["h"]->getType(), VK_LValue,
-                                              OK_Ordinary);
-
-    // If the handle isn't a void pointer, cast the builtin result to the
-    // correct type.
-    if (Handle->getType().getCanonicalType() != AST.VoidPtrTy) {
-      Call = CXXStaticCastExpr::Create(
-          AST, Handle->getType(), VK_PRValue, CK_Dependent, Call, nullptr,
-          AST.getTrivialTypeSourceInfo(Handle->getType(), SourceLocation()),
-          FPOptionsOverride(), SourceLocation(), SourceLocation(),
-          SourceRange());
-    }
-
-    BinaryOperator *Assign = BinaryOperator::Create(
-        AST, Handle, Call, BO_Assign, Handle->getType(), VK_LValue, OK_Ordinary,
-        SourceLocation(), FPOptionsOverride());
-
-    Constructor->setBody(
-        CompoundStmt::Create(AST, {Assign}, FPOptionsOverride(),
-                             SourceLocation(), SourceLocation()));
+    Constructor->setBody(CompoundStmt::Create(
+        AST, {}, FPOptionsOverride(), SourceLocation(), SourceLocation()));
     Constructor->setAccess(AccessSpecifier::AS_public);
     Record->addDecl(Constructor);
     return *this;
@@ -242,14 +211,14 @@ struct BuiltinTypeDeclBuilder {
     assert(Fields.count("h") > 0 &&
            "Subscript operator must be added after the handle.");
 
-    FieldDecl *Handle = Fields["h"];
     ASTContext &AST = Record->getASTContext();
-
-    assert(Handle->getType().getCanonicalType() != AST.VoidPtrTy &&
-           "Not yet supported for void pointer handles.");
-
-    QualType ElemTy =
-        QualType(Handle->getType()->getPointeeOrArrayElementType(), 0);
+    QualType ElemTy = AST.Char8Ty;
+    if (Template) {
+      if (const auto *TTD = dyn_cast<TemplateTypeParmDecl>(
+              Template->getTemplateParameters()->getParam(0))) {
+        ElemTy = QualType(TTD->getTypeForDecl(), 0);
+      }
+    }
     QualType ReturnTy = ElemTy;
 
     FunctionProtoType::ExtProtoInfo ExtInfo;
@@ -285,22 +254,23 @@ struct BuiltinTypeDeclBuilder {
     auto FnProtoLoc = TSInfo->getTypeLoc().getAs<FunctionProtoTypeLoc>();
     FnProtoLoc.setParam(0, IdxParam);
 
+    // FIXME: Placeholder to make sure we return the correct type - create
+    // field of element_type and return reference to it. This field will go
+    // away once indexing into resources is properly implemented in
+    // llvm/llvm-project#95956.
+ if (Fields.count("e") == 0) { + addMemberVariable("e", ElemTy, {}); + } + FieldDecl *ElemFieldDecl = Fields["e"]; + auto *This = CXXThisExpr::Create(AST, SourceLocation(), MethodDecl->getFunctionObjectParameterType(), true); - auto *HandleAccess = MemberExpr::CreateImplicit( - AST, This, false, Handle, Handle->getType(), VK_LValue, OK_Ordinary); - - auto *IndexExpr = DeclRefExpr::Create( - AST, NestedNameSpecifierLoc(), SourceLocation(), IdxParam, false, - DeclarationNameInfo(IdxParam->getDeclName(), SourceLocation()), - AST.UnsignedIntTy, VK_PRValue); - - auto *Array = - new (AST) ArraySubscriptExpr(HandleAccess, IndexExpr, ElemTy, VK_LValue, - OK_Ordinary, SourceLocation()); - - auto *Return = ReturnStmt::Create(AST, SourceLocation(), Array, nullptr); + Expr *ElemField = MemberExpr::CreateImplicit( + AST, This, false, ElemFieldDecl, ElemFieldDecl->getType(), VK_LValue, + OK_Ordinary); + auto *Return = + ReturnStmt::Create(AST, SourceLocation(), ElemField, nullptr); MethodDecl->setBody(CompoundStmt::Create(AST, {Return}, FPOptionsOverride(), SourceLocation(), diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 6d7a57d7b5a41..4be7dfbc29392 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -509,7 +509,7 @@ void Sema::Initialize() { if (Context.getTargetInfo().getTriple().isAMDGPU() || (Context.getAuxTargetInfo() && Context.getAuxTargetInfo()->getTriple().isAMDGPU())) { -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" } @@ -1676,7 +1676,7 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) { // that is different from the last template instantiation where // we emitted an error, print a template instantiation // backtrace. 
-  if (!Diags.getDiagnosticIDs()->isNote(DiagID))
+  if (!DiagnosticIDs::isBuiltinNote(DiagID))
     PrintContextStack();
 }
 
@@ -1690,8 +1690,7 @@ bool Sema::hasUncompilableErrorOccurred() const {
   if (Loc == DeviceDeferredDiags.end())
     return false;
   for (auto PDAt : Loc->second) {
-    if (Diags.getDiagnosticIDs()->isDefaultMappingAsError(
-            PDAt.second.getDiagID()))
+    if (DiagnosticIDs::isDefaultMappingAsError(PDAt.second.getDiagID()))
       return true;
   }
   return false;
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index fbb3de4b3e416..ec37c0df56c67 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -835,7 +835,7 @@ SemaBase::SemaDiagnosticBuilder SemaCUDA::DiagIfDeviceCode(SourceLocation Loc,
     if (!getLangOpts().CUDAIsDevice)
       return SemaDiagnosticBuilder::K_Nop;
     if (SemaRef.IsLastErrorImmediate &&
-        getDiagnostics().getDiagnosticIDs()->isNote(DiagID))
+        getDiagnostics().getDiagnosticIDs()->isBuiltinNote(DiagID))
       return SemaDiagnosticBuilder::K_Immediate;
     return (SemaRef.getEmissionStatus(CurFunContext) ==
             Sema::FunctionEmissionStatus::Emitted)
@@ -866,7 +866,7 @@ Sema::SemaDiagnosticBuilder SemaCUDA::DiagIfHostCode(SourceLocation Loc,
     if (getLangOpts().CUDAIsDevice)
       return SemaDiagnosticBuilder::K_Nop;
     if (SemaRef.IsLastErrorImmediate &&
-        getDiagnostics().getDiagnosticIDs()->isNote(DiagID))
+        getDiagnostics().getDiagnosticIDs()->isBuiltinNote(DiagID))
       return SemaDiagnosticBuilder::K_Immediate;
     return (SemaRef.getEmissionStatus(CurFunContext) ==
             Sema::FunctionEmissionStatus::Emitted)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index af1dc21594da8..8634b54b0535d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2755,6 +2755,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
   // These builtins restrict the element type to floating point
   // types only, and take in two arguments.
+  case Builtin::BI__builtin_elementwise_fmod:
   case Builtin::BI__builtin_elementwise_pow: {
     if (BuiltinElementwiseMath(TheCall))
       return ExprError();
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 6a1b32598bb4a..67fc603e9ce1d 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -975,11 +975,14 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
   // parameters that the surrounding function hasn't been instantiated yet. Note
   // this may happen while we're comparing two templates' constraint
   // equivalence.
-  LocalInstantiationScope ScopeForParameters(S);
-  if (auto *FD = DeclInfo.getDecl()->getAsFunction())
+  std::optional<LocalInstantiationScope> ScopeForParameters;
+  if (const NamedDecl *ND = DeclInfo.getDecl();
+      ND && ND->isFunctionOrFunctionTemplate()) {
+    ScopeForParameters.emplace(S);
+    const FunctionDecl *FD = ND->getAsFunction();
     for (auto *PVD : FD->parameters()) {
       if (!PVD->isParameterPack()) {
-        ScopeForParameters.InstantiatedLocal(PVD, PVD);
+        ScopeForParameters->InstantiatedLocal(PVD, PVD);
         continue;
       }
       // This is hacky: we're mapping the parameter pack to a size-of-1 argument
@@ -998,9 +1001,10 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
       // that we can eliminate the Scope in the cases where the declarations are
       // not necessarily instantiated. It would also benefit the noexcept
       // specifier comparison.
-      ScopeForParameters.MakeInstantiatedLocalArgPack(PVD);
-      ScopeForParameters.InstantiatedLocalPackArg(PVD, PVD);
+      ScopeForParameters->MakeInstantiatedLocalArgPack(PVD);
+      ScopeForParameters->InstantiatedLocalPackArg(PVD, PVD);
     }
+  }
 
   std::optional<Sema::CXXThisScopeRAII> ThisScope;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 1bf0e800a3622..0e536f71a2f70 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9293,15 +9293,12 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D,
         TrailingRequiresClause);
   } else if (Name.getNameKind() ==
              DeclarationName::CXXDeductionGuideName) {
-    if (TrailingRequiresClause)
-      SemaRef.Diag(TrailingRequiresClause->getBeginLoc(),
-                   diag::err_trailing_requires_clause_on_deduction_guide)
-          << TrailingRequiresClause->getSourceRange();
     if (SemaRef.CheckDeductionGuideDeclarator(D, R, SC))
       return nullptr;
-    return CXXDeductionGuideDecl::Create(SemaRef.Context, DC, D.getBeginLoc(),
-                                         ExplicitSpecifier, NameInfo, R, TInfo,
-                                         D.getEndLoc());
+    return CXXDeductionGuideDecl::Create(
+        SemaRef.Context, DC, D.getBeginLoc(), ExplicitSpecifier, NameInfo, R,
+        TInfo, D.getEndLoc(), /*Ctor=*/nullptr,
+        /*Kind=*/DeductionCandidate::Normal, TrailingRequiresClause);
   } else if (DC->isRecord()) {
     // If the name of the function is the same as the name of the record,
     // then this must be an invalid constructor that has a return type.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 14cc51cf89665..c9b9f3a0007da 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -852,38 +852,22 @@ static void handleDiagnoseIfAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (!checkFunctionConditionAttr(S, D, AL, Cond, Msg))
     return;
 
-  StringRef DefaultSevStr;
-  if (!S.checkStringLiteralArgumentAttr(AL, 2, DefaultSevStr))
+  StringRef DiagTypeStr;
+  if (!S.checkStringLiteralArgumentAttr(AL, 2, DiagTypeStr))
     return;
 
-  DiagnoseIfAttr::DefaultSeverity DefaultSev;
-  if (!DiagnoseIfAttr::ConvertStrToDefaultSeverity(DefaultSevStr, DefaultSev)) {
+  DiagnoseIfAttr::DiagnosticType DiagType;
+  if (!DiagnoseIfAttr::ConvertStrToDiagnosticType(DiagTypeStr, DiagType)) {
     S.Diag(AL.getArgAsExpr(2)->getBeginLoc(),
           diag::err_diagnose_if_invalid_diagnostic_type);
     return;
   }
 
-  StringRef WarningGroup;
-  SmallVector<StringRef> Options;
-  if (AL.getNumArgs() > 3) {
-    if (!S.checkStringLiteralArgumentAttr(AL, 3, WarningGroup))
-      return;
-    if (WarningGroup.empty() ||
-        !S.getDiagnostics().getDiagnosticIDs()->getGroupForWarningOption(
-            WarningGroup)) {
-      S.Diag(AL.getArgAsExpr(3)->getBeginLoc(),
-             diag::err_diagnose_if_unknown_warning)
-          << WarningGroup;
-      return;
-    }
-  }
-
   bool ArgDependent = false;
   if (const auto *FD = dyn_cast<FunctionDecl>(D))
     ArgDependent = ArgumentDependenceChecker(FD).referencesArgs(Cond);
   D->addAttr(::new (S.Context) DiagnoseIfAttr(
-      S.Context, AL, Cond, Msg, DefaultSev, WarningGroup, ArgDependent,
-      cast<NamedDecl>(D)));
+      S.Context, AL, Cond, Msg, DiagType, ArgDependent, cast<NamedDecl>(D)));
 }
 
 static void handleNoBuiltinAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 66df9c969256a..2db9d1fc69ed1 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6127,7 +6127,7 @@ static bool isPlaceholderToRemoveAsArg(QualType type) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
-#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id:
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
@@ -10133,6 +10133,10 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
   const VectorType *RHSVecType = RHSType->getAs<VectorType>();
   assert(LHSVecType || RHSVecType);
 
+  if (getLangOpts().HLSL)
+    return HLSL().handleVectorBinOpConversion(LHS, RHS, LHSType, RHSType,
+                                              IsCompAssign);
+
   // AltiVec-style "vector bool op vector bool" combinations are allowed
   // for some operators but not others.
   if (!AllowBothBool && LHSVecType &&
@@ -12863,7 +12867,8 @@ static void diagnoseXorMisusedAsPow(Sema &S, const ExprResult &XorLHS,
 }
 
 QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
-                                          SourceLocation Loc) {
+                                          SourceLocation Loc,
+                                          BinaryOperatorKind Opc) {
   // Ensure that either both operands are of the same vector type, or
   // one operand is of a vector type and the other is of its element type.
   QualType vType = CheckVectorOperands(LHS, RHS, Loc, false,
@@ -12883,6 +12888,15 @@ QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   if (!getLangOpts().CPlusPlus &&
      !(isa<ExtVectorType>(vType->getAs<VectorType>())))
     return InvalidLogicalVectorOperands(Loc, LHS, RHS);
+  // Beginning with HLSL 2021, HLSL disallows logical operators on vector
+  // operands and instead requires the use of the `and`, `or`, `any`, `all`, and
+  // `select` functions.
+  if (getLangOpts().HLSL &&
+      getLangOpts().getHLSLVersion() >= LangOptionsBase::HLSL_2021) {
+    (void)InvalidOperands(Loc, LHS, RHS);
+    HLSL().emitLogicalOperatorFixIt(LHS.get(), RHS.get(), Opc);
+    return QualType();
+  }
 
   return GetSignedVectorType(LHS.get()->getType());
 }
@@ -13054,7 +13068,7 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   // Check vector operands differently.
   if (LHS.get()->getType()->isVectorType() ||
       RHS.get()->getType()->isVectorType())
-    return CheckVectorLogicalOperands(LHS, RHS, Loc);
+    return CheckVectorLogicalOperands(LHS, RHS, Loc, Opc);
 
   bool EnumConstantInBoolContext = false;
   for (const ExprResult &HS : {LHS, RHS}) {
@@ -20985,7 +20999,7 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
-#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id:
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index ac3fe6ab8f9bd..b30414a8a8277 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -9509,6 +9509,18 @@ Sema::ActOnStartRequiresExpr(SourceLocation RequiresKWLoc,
   PushDeclContext(BodyScope, Body);
 
   for (ParmVarDecl *Param : LocalParameters) {
+    if (Param->getType()->isVoidType()) {
+      if (LocalParameters.size() > 1) {
+        Diag(Param->getBeginLoc(), diag::err_void_only_param);
+        Param->setType(Context.IntTy);
+      } else if (Param->getIdentifier()) {
+        Diag(Param->getBeginLoc(), diag::err_param_with_void_type);
+        Param->setType(Context.IntTy);
+      } else if (Param->getType().hasQualifiers()) {
+        Diag(Param->getBeginLoc(), diag::err_void_param_qualified);
+      }
+    }
+
     if (Param->hasDefaultArg())
       // C++2a [expr.prim.req] p4
       //     [...] A local parameter of a requires-expression shall not have a
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index ebe76185cbb2d..43cc6c81ae5cb 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -401,6 +401,194 @@ void SemaHLSL::DiagnoseAttrStageMismatch(
       << (AllowedStages.size() != 1) << join(StageStrings, ", ");
 }
 
+template <CastKind Kind>
+static void castVector(Sema &S, ExprResult &E, QualType &Ty, unsigned Sz) {
+  if (const auto *VTy = Ty->getAs<VectorType>())
+    Ty = VTy->getElementType();
+  Ty = S.getASTContext().getExtVectorType(Ty, Sz);
+  E = S.ImpCastExprToType(E.get(), Ty, Kind);
+}
+
+template <CastKind Kind>
+static QualType castElement(Sema &S, ExprResult &E, QualType Ty) {
+  E = S.ImpCastExprToType(E.get(), Ty, Kind);
+  return Ty;
+}
+
+static QualType handleFloatVectorBinOpConversion(
+    Sema &SemaRef, ExprResult &LHS, ExprResult &RHS, QualType LHSType,
+    QualType RHSType, QualType LElTy, QualType RElTy, bool IsCompAssign) {
+  bool LHSFloat = LElTy->isRealFloatingType();
+  bool RHSFloat = RElTy->isRealFloatingType();
+
+  if (LHSFloat && RHSFloat) {
+    if (IsCompAssign ||
+        SemaRef.getASTContext().getFloatingTypeOrder(LElTy, RElTy) > 0)
+      return castElement<CK_FloatingCast>(SemaRef, RHS, LHSType);
+
+    return castElement<CK_FloatingCast>(SemaRef, LHS, RHSType);
+  }
+
+  if (LHSFloat)
+    return castElement<CK_IntegralToFloating>(SemaRef, RHS, LHSType);
+
+  assert(RHSFloat);
+  if (IsCompAssign)
+    return castElement<CK_FloatingToIntegral>(SemaRef, RHS, LHSType);
+
+  return castElement<CK_IntegralToFloating>(SemaRef, LHS, RHSType);
+}
+
+static QualType handleIntegerVectorBinOpConversion(
+    Sema &SemaRef, ExprResult &LHS, ExprResult &RHS, QualType LHSType,
+    QualType RHSType, QualType LElTy, QualType RElTy, bool IsCompAssign) {
+
+  int IntOrder = SemaRef.Context.getIntegerTypeOrder(LElTy, RElTy);
+  bool LHSSigned = LElTy->hasSignedIntegerRepresentation();
+  bool RHSSigned = RElTy->hasSignedIntegerRepresentation();
+  auto &Ctx = SemaRef.getASTContext();
+
+  // If both types have the same signedness, use the higher ranked type.
+  if (LHSSigned == RHSSigned) {
+    if (IsCompAssign || IntOrder >= 0)
+      return castElement<CK_IntegralCast>(SemaRef, RHS, LHSType);
+
+    return castElement<CK_IntegralCast>(SemaRef, LHS, RHSType);
+  }
+
+  // If the unsigned type has a rank greater than or equal to that of the
+  // signed type, use the unsigned type.
+  if (IntOrder != (LHSSigned ? 1 : -1)) {
+    if (IsCompAssign || RHSSigned)
+      return castElement<CK_IntegralCast>(SemaRef, RHS, LHSType);
+    return castElement<CK_IntegralCast>(SemaRef, LHS, RHSType);
+  }
+
+  // At this point the signed type has higher rank than the unsigned type, which
+  // means it will be the same size or bigger. If the signed type is bigger, it
+  // can represent all the values of the unsigned type, so select it.
+  if (Ctx.getIntWidth(LElTy) != Ctx.getIntWidth(RElTy)) {
+    if (IsCompAssign || LHSSigned)
+      return castElement<CK_IntegralCast>(SemaRef, RHS, LHSType);
+    return castElement<CK_IntegralCast>(SemaRef, LHS, RHSType);
+  }
+
+  // This is a bit of an odd duck case in HLSL. It shouldn't happen, but can due
+  // to C/C++ leaking through. The place this happens today is long vs long
+  // long. When arguments are vector<long> and vector<long long>,
+  // the long long has higher rank than long even though they are the same size.
+
+  // If this is a compound assignment cast the right hand side to the left hand
+  // side's type.
+  if (IsCompAssign)
+    return castElement<CK_IntegralCast>(SemaRef, RHS, LHSType);
+
+  // If this isn't a compound assignment we convert to unsigned long long.
+  QualType ElTy = Ctx.getCorrespondingUnsignedType(LHSSigned ? LElTy : RElTy);
+  QualType NewTy = Ctx.getExtVectorType(
+      ElTy, RHSType->castAs<VectorType>()->getNumElements());
+  (void)castElement<CK_IntegralCast>(SemaRef, RHS, NewTy);
+
+  return castElement<CK_IntegralCast>(SemaRef, LHS, NewTy);
+}
+
+static CastKind getScalarCastKind(ASTContext &Ctx, QualType DestTy,
+                                  QualType SrcTy) {
+  if (DestTy->isRealFloatingType() && SrcTy->isRealFloatingType())
+    return CK_FloatingCast;
+  if (DestTy->isIntegralType(Ctx) && SrcTy->isIntegralType(Ctx))
+    return CK_IntegralCast;
+  if (DestTy->isRealFloatingType())
+    return CK_IntegralToFloating;
+  assert(SrcTy->isRealFloatingType() && DestTy->isIntegralType(Ctx));
+  return CK_FloatingToIntegral;
+}
+
+QualType SemaHLSL::handleVectorBinOpConversion(ExprResult &LHS, ExprResult &RHS,
+                                               QualType LHSType,
+                                               QualType RHSType,
+                                               bool IsCompAssign) {
+  const auto *LVecTy = LHSType->getAs<VectorType>();
+  const auto *RVecTy = RHSType->getAs<VectorType>();
+  auto &Ctx = getASTContext();
+
+  // If the LHS is not a vector and this is a compound assignment, we truncate
+  // the argument to a scalar then convert it to the LHS's type.
+  if (!LVecTy && IsCompAssign) {
+    QualType RElTy = RHSType->castAs<VectorType>()->getElementType();
+    RHS = SemaRef.ImpCastExprToType(RHS.get(), RElTy, CK_HLSLVectorTruncation);
+    RHSType = RHS.get()->getType();
+    if (Ctx.hasSameUnqualifiedType(LHSType, RHSType))
+      return LHSType;
+    RHS = SemaRef.ImpCastExprToType(RHS.get(), LHSType,
+                                    getScalarCastKind(Ctx, LHSType, RHSType));
+    return LHSType;
+  }
+
+  unsigned EndSz = std::numeric_limits<unsigned>::max();
+  unsigned LSz = 0;
+  if (LVecTy)
+    LSz = EndSz = LVecTy->getNumElements();
+  if (RVecTy)
+    EndSz = std::min(RVecTy->getNumElements(), EndSz);
+  assert(EndSz != std::numeric_limits<unsigned>::max() &&
+         "one of the above should have had a value");
+
+  // In a compound assignment, the left operand does not change type, the right
+  // operand is converted to the type of the left operand.
+  if (IsCompAssign && LSz != EndSz) {
+    Diag(LHS.get()->getBeginLoc(),
+         diag::err_hlsl_vector_compound_assignment_truncation)
+        << LHSType << RHSType;
+    return QualType();
+  }
+
+  if (RVecTy && RVecTy->getNumElements() > EndSz)
+    castVector<CK_HLSLVectorTruncation>(SemaRef, RHS, RHSType, EndSz);
+  if (!IsCompAssign && LVecTy && LVecTy->getNumElements() > EndSz)
+    castVector<CK_HLSLVectorTruncation>(SemaRef, LHS, LHSType, EndSz);
+
+  if (!RVecTy)
+    castVector<CK_VectorSplat>(SemaRef, RHS, RHSType, EndSz);
+  if (!IsCompAssign && !LVecTy)
+    castVector<CK_VectorSplat>(SemaRef, LHS, LHSType, EndSz);
+
+  // If we're at the same type after resizing we can stop here.
+  if (Ctx.hasSameUnqualifiedType(LHSType, RHSType))
+    return Ctx.getCommonSugaredType(LHSType, RHSType);
+
+  QualType LElTy = LHSType->castAs<VectorType>()->getElementType();
+  QualType RElTy = RHSType->castAs<VectorType>()->getElementType();
+
+  // Handle conversion for floating point vectors.
+  if (LElTy->isRealFloatingType() || RElTy->isRealFloatingType())
+    return handleFloatVectorBinOpConversion(SemaRef, LHS, RHS, LHSType, RHSType,
+                                            LElTy, RElTy, IsCompAssign);
+
+  assert(LElTy->isIntegralType(Ctx) && RElTy->isIntegralType(Ctx) &&
+         "HLSL Vectors can only contain integer or floating point types");
+  return handleIntegerVectorBinOpConversion(SemaRef, LHS, RHS, LHSType, RHSType,
+                                            LElTy, RElTy, IsCompAssign);
+}
+
+void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS,
+                                        BinaryOperatorKind Opc) {
+  assert((Opc == BO_LOr || Opc == BO_LAnd) &&
+         "Called with non-logical operator");
+  llvm::SmallVector<char> Buff;
+  llvm::raw_svector_ostream OS(Buff);
+  PrintingPolicy PP(SemaRef.getLangOpts());
+  StringRef NewFnName = Opc == BO_LOr ? "or" : "and";
+  OS << NewFnName << "(";
+  LHS->printPretty(OS, nullptr, PP);
+  OS << ", ";
+  RHS->printPretty(OS, nullptr, PP);
+  OS << ")";
+  SourceRange FullRange = SourceRange(LHS->getBeginLoc(), RHS->getEndLoc());
+  SemaRef.Diag(LHS->getBeginLoc(), diag::note_function_suggestion)
+      << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str());
+}
+
 void SemaHLSL::handleNumThreadsAttr(Decl *D, const ParsedAttr &AL) {
   llvm::VersionTuple SMVersion =
       getASTContext().getTargetInfo().getTriple().getOSVersion();
@@ -693,13 +881,19 @@ bool clang::CreateHLSLAttributedResourceType(
 // HLSL resource. The attributes are collected in HLSLResourcesTypeAttrs and at
 // the end of the declaration they are applied to the declaration type by
 // wrapping it in HLSLAttributedResourceType.
-bool SemaHLSL::handleResourceTypeAttr(const ParsedAttr &AL) {
-  Attr *A = nullptr;
+bool SemaHLSL::handleResourceTypeAttr(QualType T, const ParsedAttr &AL) {
+  // only allow resource type attributes on intangible types
+  if (!T->isHLSLResourceType()) {
+    Diag(AL.getLoc(), diag::err_hlsl_attribute_needs_intangible_type)
+        << AL << getASTContext().HLSLResourceTy;
+    return false;
+  }
 
   // validate number of arguments
   if (!AL.checkExactlyNumArgs(SemaRef, AL.getMinArgs()))
     return false;
 
+  Attr *A = nullptr;
   switch (AL.getKind()) {
   case ParsedAttr::AT_HLSLResourceClass: {
     if (!AL.isArgIdent(0)) {
@@ -1771,6 +1965,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case Builtin::BI__builtin_elementwise_exp:
   case Builtin::BI__builtin_elementwise_exp2:
   case Builtin::BI__builtin_elementwise_floor:
+  case Builtin::BI__builtin_elementwise_fmod:
   case Builtin::BI__builtin_elementwise_log:
   case Builtin::BI__builtin_elementwise_log2:
   case Builtin::BI__builtin_elementwise_log10:
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index ecbcc19413dc6..89142b837e60a 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -343,6 +343,18 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind,
       return false;
     }
 
+  case OpenACCClauseKind::Collapse: {
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Loop:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  }
+
   default:
     // Do nothing so we can go to the 'unimplemented' diagnostic instead.
    return true;
@@ -1037,6 +1049,26 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitReductionClause(
                                         ValidVars, Clause.getEndLoc());
 }
 
+OpenACCClause *SemaOpenACCClauseVisitor::VisitCollapseClause(
+    SemaOpenACC::OpenACCParsedClause &Clause) {
+  // Duplicates here are not really sensible. We could possibly permit
+  // multiples if they all had the same value, but there isn't really a good
+  // reason to do so. Also, this simplifies the suppression of duplicates, in
+  // that we know if we 'find' one after instantiation, that it is the same
+  // clause, which simplifies instantiation/checking/etc.
+  if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause))
+    return nullptr;
+
+  ExprResult LoopCount = SemaRef.CheckCollapseLoopCount(Clause.getLoopCount());
+
+  if (!LoopCount.isUsable())
+    return nullptr;
+
+  return OpenACCCollapseClause::Create(Ctx, Clause.getBeginLoc(),
+                                       Clause.getLParenLoc(), Clause.isForce(),
+                                       LoopCount.get(), Clause.getEndLoc());
+}
+
 } // namespace
 
 SemaOpenACC::SemaOpenACC(Sema &S) : SemaBase(S) {}
@@ -1273,6 +1305,9 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
     }
   } IntExprDiagnoser(DK, CK, IntExpr);
 
+  if (!IntExpr)
+    return ExprError();
+
   ExprResult IntExprResult = SemaRef.PerformContextualImplicitConversion(
       Loc, IntExpr, IntExprDiagnoser);
   if (IntExprResult.isInvalid())
@@ -1583,6 +1618,34 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
                                   OK_Ordinary, ColonLoc, RBLoc);
 }
 
+ExprResult SemaOpenACC::CheckCollapseLoopCount(Expr *LoopCount) {
+  if (!LoopCount)
+    return ExprError();
+
+  assert((LoopCount->isInstantiationDependent() ||
+          LoopCount->getType()->isIntegerType()) &&
+         "Loop argument non integer?");
+
+  // If this is dependent, there really isn't anything we can check.
+  if (LoopCount->isInstantiationDependent())
+    return ExprResult{LoopCount};
+
+  std::optional<llvm::APSInt> ICE =
+      LoopCount->getIntegerConstantExpr(getASTContext());
+
+  // OpenACC 3.3: 2.9.1
+  // The argument to the collapse clause must be a constant positive integer
+  // expression.
+  if (!ICE || *ICE <= 0) {
+    Diag(LoopCount->getBeginLoc(), diag::err_acc_collapse_loop_count)
+        << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue();
+    return ExprError();
+  }
+
+  return ExprResult{
+      ConstantExpr::Create(getASTContext(), LoopCount, APValue{*ICE})};
+}
+
 bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
                                           SourceLocation StartLoc) {
   SemaRef.DiscardCleanupsInEvaluationContext();
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 9afb8cea26fe7..8615da4b044a8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -9195,7 +9195,7 @@ static bool checkOpenMPIterationSpace(
       SemaRef.Diag(CollapseLoopCountExpr->getExprLoc(),
                    diag::note_omp_collapse_ordered_expr)
           << 0 << CollapseLoopCountExpr->getSourceRange();
-    else
+    else if (OrderedLoopCountExpr)
       SemaRef.Diag(OrderedLoopCountExpr->getExprLoc(),
                    diag::note_omp_collapse_ordered_expr)
          << 1 << OrderedLoopCountExpr->getSourceRange();
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d304f322aced6..0c1e054f7c30a 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -7300,10 +7300,8 @@ static bool diagnoseDiagnoseIfAttrsWith(Sema &S, const NamedDecl *ND,
     return false;
 
   auto WarningBegin = std::stable_partition(
-      Attrs.begin(), Attrs.end(), [](const DiagnoseIfAttr *DIA) {
-        return DIA->getDefaultSeverity() == DiagnoseIfAttr::DS_error &&
-               DIA->getWarningGroup().empty();
-      });
+      Attrs.begin(), Attrs.end(),
+      [](const DiagnoseIfAttr *DIA) { return DIA->isError(); });
 
   // Note that diagnose_if attributes are late-parsed, so they appear in the
  // correct order (unlike enable_if attributes).
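For reference, the two-string form of diagnose_if that this partitioning handles takes a condition, a message, and a diagnostic type. A minimal user-code sketch (not part of this patch; it relies only on the documented "error" and "warning" diagnostic types):

// "error" attributes are partitioned to the front and diagnosed first; a
// "warning" attribute is only reported when no error attribute fires.
int safe_div(int a, int b)
    __attribute__((diagnose_if(b == 0, "division by zero", "error")))
    __attribute__((diagnose_if(b == 1, "dividing by one is a no-op", "warning")));

int caller() {
  return safe_div(10, 2); // neither condition is satisfied; no diagnostic
}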
@@ -7317,32 +7315,11 @@ static bool diagnoseDiagnoseIfAttrsWith(Sema &S, const NamedDecl *ND,
     return true;
   }
 
-  auto ToSeverity = [](DiagnoseIfAttr::DefaultSeverity Sev) {
-    switch (Sev) {
-    case DiagnoseIfAttr::DS_warning:
-      return diag::Severity::Warning;
-    case DiagnoseIfAttr::DS_error:
-      return diag::Severity::Error;
-    }
-    llvm_unreachable("Fully covered switch above!");
-  };
-
   for (const auto *DIA : llvm::make_range(WarningBegin, Attrs.end()))
     if (IsSuccessful(DIA)) {
-      if (DIA->getWarningGroup().empty() &&
-          DIA->getDefaultSeverity() == DiagnoseIfAttr::DS_warning) {
-        S.Diag(Loc, diag::warn_diagnose_if_succeeded) << DIA->getMessage();
-        S.Diag(DIA->getLocation(), diag::note_from_diagnose_if)
-            << DIA->getParent() << DIA->getCond()->getSourceRange();
-      } else {
-        auto DiagGroup = S.Diags.getDiagnosticIDs()->getGroupForWarningOption(
-            DIA->getWarningGroup());
-        assert(DiagGroup);
-        auto DiagID = S.Diags.getDiagnosticIDs()->getCustomDiagID(
-            {ToSeverity(DIA->getDefaultSeverity()), "%0",
-             DiagnosticIDs::CLASS_WARNING, false, false, *DiagGroup});
-        S.Diag(Loc, DiagID) << DIA->getMessage();
-      }
+      S.Diag(Loc, diag::warn_diagnose_if_succeeded) << DIA->getMessage();
+      S.Diag(DIA->getLocation(), diag::note_from_diagnose_if)
+          << DIA->getParent() << DIA->getCond()->getSourceRange();
     }
 
   return false;
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 99423b01114cc..c7d48b81bc034 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1134,7 +1134,8 @@ bool Sema::BuildTypeConstraint(const CXXScopeSpec &SS,
       SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc(),
       ConceptName, CD, /*FoundDecl=*/USD ? cast<NamedDecl>(USD) : CD,
       TypeConstr->LAngleLoc.isValid() ? &TemplateArgs : nullptr,
-      ConstrainedParameter, EllipsisLoc);
+      ConstrainedParameter, Context.getTypeDeclType(ConstrainedParameter),
+      EllipsisLoc);
 }
 
 template <typename ArgumentLocAppender>
@@ -1191,6 +1192,7 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS,
                                 ConceptDecl *NamedConcept, NamedDecl *FoundDecl,
                                 const TemplateArgumentListInfo *TemplateArgs,
                                 TemplateTypeParmDecl *ConstrainedParameter,
+                                QualType ConstrainedType,
                                 SourceLocation EllipsisLoc) {
   // C++2a [temp.param]p4:
   //     [...] If Q is of the form C<A1, ..., An>, then let E' be
       TemplateArgs ?
           ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs) : nullptr;
-  QualType ParamAsArgument(ConstrainedParameter->getTypeForDecl(), 0);
+  QualType ParamAsArgument = ConstrainedType;
   ExprResult ImmediatelyDeclaredConstraint = formImmediatelyDeclaredConstraint(
       *this, NS, NameInfo, NamedConcept, FoundDecl,
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 55f38743e2768..b36381422851f 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -371,7 +371,7 @@ Response HandleFunctionTemplateDecl(const FunctionTemplateDecl *FTD,
           Specialization->getTemplateInstantiationArgs().asArray();
     }
     Result.addOuterTemplateArguments(
-        const_cast<FunctionTemplateDecl *>(FTD), Arguments,
+        TSTy->getTemplateName().getAsTemplateDecl(), Arguments,
        /*Final=*/false);
   }
 }
@@ -1655,6 +1655,21 @@ namespace {
                                            SubstTemplateTypeParmPackTypeLoc TL,
                                            bool SuppressObjCLifetime);
 
+    QualType
+    TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB,
+                                       SubstTemplateTypeParmTypeLoc TL) {
+      if (SemaRef.CodeSynthesisContexts.back().Kind !=
+          Sema::CodeSynthesisContext::ConstraintSubstitution)
+        return inherited::TransformSubstTemplateTypeParmType(TLB, TL);
+
+      auto PackIndex = TL.getTypePtr()->getPackIndex();
+      std::optional<Sema::ArgumentPackSubstitutionIndexRAII> SubstIndex;
+      if (SemaRef.ArgumentPackSubstitutionIndex == -1 && PackIndex)
+        SubstIndex.emplace(SemaRef, *PackIndex);
+
+      return inherited::TransformSubstTemplateTypeParmType(TLB, TL);
+    }
+
     CXXRecordDecl::LambdaDependencyKind
     ComputeLambdaDependency(LambdaScopeInfo *LSI) {
       if (auto TypeAlias =
@@ -1677,7 +1692,8 @@ namespace {
       // Lambdas have already been processed inside their eval contexts.
       if (SemaRef.RebuildingImmediateInvocation)
         return E;
-      LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true);
+      LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true,
+                                    /*InstantiatingLambda=*/true);
       Sema::ConstraintEvalRAII<TemplateInstantiator> RAII(*this);
 
       return inherited::TransformLambdaExpr(E);
@@ -1721,6 +1737,33 @@ namespace {
       return inherited::TransformLambdaBody(E, Body);
     }
 
+    ExprResult RebuildSizeOfPackExpr(SourceLocation OperatorLoc,
+                                     NamedDecl *Pack, SourceLocation PackLoc,
+                                     SourceLocation RParenLoc,
+                                     std::optional<unsigned> Length,
+                                     ArrayRef<TemplateArgument> PartialArgs) {
+      if (SemaRef.CodeSynthesisContexts.back().Kind !=
+          Sema::CodeSynthesisContext::ConstraintNormalization)
+        return inherited::RebuildSizeOfPackExpr(OperatorLoc, Pack, PackLoc,
+                                                RParenLoc, Length, PartialArgs);
+
+#ifndef NDEBUG
+      for (auto *Iter = TemplateArgs.begin(); Iter != TemplateArgs.end();
+           ++Iter)
+        for (const TemplateArgument &TA : Iter->Args)
+          assert(TA.getKind() != TemplateArgument::Pack || TA.pack_size() == 1);
+#endif
+      Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(
+          SemaRef, /*NewSubstitutionIndex=*/0);
+      Decl *NewPack = TransformDecl(PackLoc, Pack);
+      if (!NewPack)
+        return ExprError();
+
+      return inherited::RebuildSizeOfPackExpr(OperatorLoc,
+                                              cast<NamedDecl>(NewPack), PackLoc,
+                                              RParenLoc, Length, PartialArgs);
+    }
+
     ExprResult TransformRequiresExpr(RequiresExpr *E) {
       LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true);
       ExprResult TransReq = inherited::TransformRequiresExpr(E);
@@ -2432,8 +2475,18 @@ QualType TemplateInstantiator::TransformFunctionProtoType(TypeLocBuilder &TLB,
     CXXRecordDecl *ThisContext, Qualifiers ThisTypeQuals,
     Fn TransformExceptionSpec) {
-  // We need a local instantiation scope for this function prototype.
-  LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true);
+  // If this is a lambda, the transformation MUST be done in the
+  // CurrentInstantiationScope since it introduces a mapping of
+  // the original to the newly created transformed parameters.
+  //
+  // In that case, TemplateInstantiator::TransformLambdaExpr will
+  // have already pushed a scope for this prototype, so don't create
+  // a second one.
+  LocalInstantiationScope *Current = getSema().CurrentInstantiationScope;
+  std::optional<LocalInstantiationScope> Scope;
+  if (!Current || !Current->isLambda())
+    Scope.emplace(SemaRef, /*CombineWithOuterScope=*/true);
+
   return inherited::TransformFunctionProtoType(
       TLB, TL, ThisContext, ThisTypeQuals, TransformExceptionSpec);
 }
@@ -3067,6 +3120,58 @@ namespace {
 
 } // namespace
 
+namespace {
+
+struct ExpandPackedTypeConstraints
+    : TreeTransform<ExpandPackedTypeConstraints> {
+
+  using inherited = TreeTransform<ExpandPackedTypeConstraints>;
+
+  ExpandPackedTypeConstraints(Sema &SemaRef) : inherited(SemaRef) {}
+
+  using inherited::TransformTemplateTypeParmType;
+
+  QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB,
+                                         TemplateTypeParmTypeLoc TL, bool) {
+    const TemplateTypeParmType *T = TL.getTypePtr();
+    if (!T->isParameterPack()) {
+      TemplateTypeParmTypeLoc NewTL =
+          TLB.push<TemplateTypeParmTypeLoc>(TL.getType());
+      NewTL.setNameLoc(TL.getNameLoc());
+      return TL.getType();
+    }
+
+    assert(SemaRef.ArgumentPackSubstitutionIndex != -1);
+
+    QualType Result = SemaRef.Context.getSubstTemplateTypeParmType(
+        TL.getType(), T->getDecl(), T->getIndex(),
+        SemaRef.ArgumentPackSubstitutionIndex);
+    SubstTemplateTypeParmTypeLoc NewTL =
+        TLB.push<SubstTemplateTypeParmTypeLoc>(Result);
+    NewTL.setNameLoc(TL.getNameLoc());
+    return Result;
+  }
+
+  QualType TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB,
+                                              SubstTemplateTypeParmTypeLoc TL) {
+    const SubstTemplateTypeParmType *T = TL.getTypePtr();
+    if (T->getPackIndex()) {
+      SubstTemplateTypeParmTypeLoc TypeLoc =
+          TLB.push<SubstTemplateTypeParmTypeLoc>(TL.getType());
+      TypeLoc.setNameLoc(TL.getNameLoc());
+      return TypeLoc.getType();
+    }
+    return inherited::TransformSubstTemplateTypeParmType(TLB, TL);
+  }
+
+  bool SubstTemplateArguments(ArrayRef<TemplateArgumentLoc> Args,
+                              TemplateArgumentListInfo &Out) {
+    return inherited::TransformTemplateArguments(Args.begin(), Args.end(), Out);
+  }
+};
+
+} // namespace
+
 bool Sema::SubstTypeConstraint(
     TemplateTypeParmDecl *Inst, const TypeConstraint *TC,
     const MultiLevelTemplateArgumentList &TemplateArgs,
@@ -3075,9 +3180,62 @@ bool Sema::SubstTypeConstraint(
       TC->getTemplateArgsAsWritten();
 
   if (!EvaluateConstraints) {
-    Inst->setTypeConstraint(TC->getConceptReference(),
-                            TC->getImmediatelyDeclaredConstraint());
-    return false;
+    bool ShouldExpandExplicitTemplateArgs =
+        TemplArgInfo && ArgumentPackSubstitutionIndex != -1 &&
+        llvm::any_of(TemplArgInfo->arguments(), [](auto &Arg) {
+          return Arg.getArgument().containsUnexpandedParameterPack();
+        });
+
+    // We want to transform the packs into Subst* nodes for type constraints
+    // inside a pack expansion. For example,
+    //
+    //   template <class... Ts> void foo() {
+    //     bar([](C<Ts> auto value) {}...);
+    //   }
+    //
+    // As we expand Ts in the process of instantiating foo(), and retain
+    // the original template depths of Ts until the constraint evaluation, we
+    // would otherwise have no chance to expand Ts by the time of evaluating
+    // C<auto, Ts>.
+    //
+    // So we form a Subst* node for Ts along with a proper substitution index
+    // here, and substitute the node with a complete MLTAL later in evaluation.
+    if (ShouldExpandExplicitTemplateArgs) {
+      TemplateArgumentListInfo InstArgs;
+      InstArgs.setLAngleLoc(TemplArgInfo->LAngleLoc);
+      InstArgs.setRAngleLoc(TemplArgInfo->RAngleLoc);
+      if (ExpandPackedTypeConstraints(*this).SubstTemplateArguments(
+              TemplArgInfo->arguments(), InstArgs))
+        return true;
+
+      // The type of the original parameter.
+      auto *ConstraintExpr = TC->getImmediatelyDeclaredConstraint();
+      QualType ConstrainedType;
+
+      if (auto *FE = dyn_cast<CXXFoldExpr>(ConstraintExpr)) {
+        assert(FE->getLHS());
+        ConstraintExpr = FE->getLHS();
+      }
+      auto *CSE = cast<ConceptSpecializationExpr>(ConstraintExpr);
+      assert(!CSE->getTemplateArguments().empty() &&
+             "Empty template arguments?");
+      ConstrainedType = CSE->getTemplateArguments()[0].getAsType();
+      assert(!ConstrainedType.isNull() &&
+             "Failed to extract the original ConstrainedType?");
+
+      return AttachTypeConstraint(
+          TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(),
+          TC->getNamedConcept(),
+          /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs,
+          Inst, ConstrainedType,
+          Inst->isParameterPack()
+              ? cast<CXXFoldExpr>(TC->getImmediatelyDeclaredConstraint())
+                    ->getEllipsisLoc()
+              : SourceLocation());
+    }
+    Inst->setTypeConstraint(TC->getConceptReference(),
+                            TC->getImmediatelyDeclaredConstraint());
+    return false;
   }
 
   TemplateArgumentListInfo InstArgs;
@@ -3093,6 +3251,7 @@ bool Sema::SubstTypeConstraint(
       TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(),
       TC->getNamedConcept(),
       /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, Inst,
+      Context.getTypeDeclType(Inst),
       Inst->isParameterPack()
          ? cast<CXXFoldExpr>(TC->getImmediatelyDeclaredConstraint())
               ->getEllipsisLoc()
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e055c87e78381..1c35c7d288e32 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -284,8 +284,7 @@ static void instantiateDependentDiagnoseIfAttr(
   if (Cond)
     New->addAttr(new (S.getASTContext()) DiagnoseIfAttr(
         S.getASTContext(), *DIA, Cond, DIA->getMessage(),
-        DIA->getDefaultSeverity(), DIA->getWarningGroup(),
-        DIA->getArgDependent(), New));
+        DIA->getDiagnosticType(), DIA->getArgDependent(), New));
 }
 
 // Constructs and adds to New a new instance of CUDALaunchBoundsAttr using
@@ -2234,7 +2233,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl(
         SemaRef.Context, DC, D->getInnerLocStart(),
         InstantiatedExplicitSpecifier, NameInfo, T, TInfo,
         D->getSourceRange().getEnd(), DGuide->getCorrespondingConstructor(),
-        DGuide->getDeductionCandidateKind());
+        DGuide->getDeductionCandidateKind(), TrailingRequiresClause);
     Function->setAccess(D->getAccess());
   } else {
     Function = FunctionDecl::Create(
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 950bd6db0359d..c44fc9c4194ca 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -3035,7 +3035,9 @@ InventTemplateParameter(TypeProcessingState &state, QualType T,
             AutoLoc.getNestedNameSpecifierLoc(), AutoLoc.getConceptNameInfo(),
             AutoLoc.getNamedConcept(), /*FoundDecl=*/AutoLoc.getFoundDecl(),
             AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr,
-            InventedTemplateParam, D.getEllipsisLoc());
+            InventedTemplateParam,
+            S.Context.getTypeDeclType(InventedTemplateParam),
+            D.getEllipsisLoc());
       }
     } else {
       // The 'auto' appears in the decl-specifiers; we've not finished forming
@@ -3072,7 +3074,9 @@ InventTemplateParameter(TypeProcessingState &state, QualType T,
             /*FoundDecl=*/
             USD ? cast<NamedDecl>(USD) : CD,
             TemplateId->LAngleLoc.isValid() ?
                &TemplateArgsInfo : nullptr,
-            InventedTemplateParam, D.getEllipsisLoc());
+            InventedTemplateParam,
+            S.Context.getTypeDeclType(InventedTemplateParam),
+            D.getEllipsisLoc());
       }
     }
   }
@@ -8860,7 +8864,7 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
       // decl-specifier-seq; do not collect attributes on declarations or those
      // that get to slide after declaration name.
       if (TAL == TAL_DeclSpec &&
-          state.getSema().HLSL().handleResourceTypeAttr(attr))
+          state.getSema().HLSL().handleResourceTypeAttr(type, attr))
         attr.setUsedAsTypeAttr();
       break;
     }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 95ded5e59a9fa..6fdb18d51acef 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -684,10 +684,6 @@ class TreeTransform {
                                    Qualifiers ThisTypeQuals,
                                    Fn TransformExceptionSpec);
 
-  template <typename Fn>
-  QualType TransformAttributedType(TypeLocBuilder &TLB, AttributedTypeLoc TL,
-                                   Fn TransformModifiedType);
-
   bool TransformExceptionSpec(SourceLocation Loc,
                               FunctionProtoType::ExceptionSpecInfo &ESI,
                               SmallVectorImpl<QualType> &Exceptions,
@@ -7373,11 +7369,10 @@ TreeTransform<Derived>::TransformElaboratedType(TypeLocBuilder &TLB,
 }
 
 template <typename Derived>
-template <typename Fn>
-QualType TreeTransform<Derived>::TransformAttributedType(
-    TypeLocBuilder &TLB, AttributedTypeLoc TL, Fn TransformModifiedTypeFn) {
+QualType TreeTransform<Derived>::TransformAttributedType(TypeLocBuilder &TLB,
+                                                         AttributedTypeLoc TL) {
   const AttributedType *oldType = TL.getTypePtr();
-  QualType modifiedType = TransformModifiedTypeFn(TLB, TL.getModifiedLoc());
+  QualType modifiedType = getDerived().TransformType(TLB, TL.getModifiedLoc());
   if (modifiedType.isNull())
     return QualType();
 
@@ -7392,12 +7387,27 @@ QualType TreeTransform<Derived>::TransformAttributedType(
   // FIXME: dependent operand expressions?
   if (getDerived().AlwaysRebuild() ||
       modifiedType != oldType->getModifiedType()) {
-    TypeLocBuilder AuxiliaryTLB;
-    AuxiliaryTLB.reserve(TL.getFullDataSize());
-    QualType equivalentType =
-        getDerived().TransformType(AuxiliaryTLB, TL.getEquivalentTypeLoc());
-    if (equivalentType.isNull())
-      return QualType();
+    // If the equivalent type is equal to the modified type, we don't want to
+    // transform it as well because:
+    //
+    //   1. The transformation would yield the same result and is therefore
+    //      superfluous, and
+    //
+    //   2. Transforming the same type twice can cause problems, e.g. if it
+    //      is a FunctionProtoType, we may end up instantiating the function
+    //      parameters twice, which causes an assertion since the parameters
+    //      are already bound to their counterparts in the template for this
+    //      instantiation.
+    //
+    QualType equivalentType = modifiedType;
+    if (TL.getModifiedLoc().getType() != TL.getEquivalentTypeLoc().getType()) {
+      TypeLocBuilder AuxiliaryTLB;
+      AuxiliaryTLB.reserve(TL.getFullDataSize());
+      equivalentType =
+          getDerived().TransformType(AuxiliaryTLB, TL.getEquivalentTypeLoc());
+      if (equivalentType.isNull())
+        return QualType();
+    }
 
     // Check whether we can add nullability; it is only represented as
     // type sugar, and therefore cannot be diagnosed in any other way.
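The double-transformation hazard described in the comment above shows up with generic lambdas whose call operator type is wrapped in an AttributedType: the modified and equivalent types are then the same FunctionProtoType, and transforming both would rebuild the parameters twice. A hedged sketch of the kind of user code that reaches this path (hypothetical example; the specific attribute is illustrative, and any type attribute sugaring the call operator's function type would behave similarly):

// Instantiating apply<int>() transforms the lambda's call operator, whose
// type is an AttributedType over a single FunctionProtoType; with the guard
// above, the parameter 'x' is only rebuilt once.
template <typename T> T apply(T v) {
  auto fn = [](T x) __attribute__((preserve_most)) { return x; };
  return fn(v);
}
int main() { return apply<int>(0); }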
@@ -7421,15 +7431,6 @@ QualType TreeTransform::TransformAttributedType( return result; } -template -QualType TreeTransform::TransformAttributedType(TypeLocBuilder &TLB, - AttributedTypeLoc TL) { - return getDerived().TransformAttributedType( - TLB, TL, [&](TypeLocBuilder &TLB, TypeLoc ModifiedLoc) -> QualType { - return getDerived().TransformType(TLB, ModifiedLoc); - }); -} - template QualType TreeTransform::TransformCountAttributedType( TypeLocBuilder &TLB, CountAttributedTypeLoc TL) { @@ -11821,6 +11822,31 @@ void OpenACCClauseTransform::VisitReductionClause( ParsedClause.getLParenLoc(), C.getReductionOp(), ValidVars, ParsedClause.getEndLoc()); } + +template +void OpenACCClauseTransform::VisitCollapseClause( + const OpenACCCollapseClause &C) { + Expr *LoopCount = const_cast(C.getLoopCount()); + assert(LoopCount && "collapse clause constructed with invalid loop count"); + + ExprResult NewLoopCount = Self.TransformExpr(LoopCount); + + NewLoopCount = Self.getSema().OpenACC().ActOnIntExpr( + OpenACCDirectiveKind::Invalid, ParsedClause.getClauseKind(), + NewLoopCount.get()->getBeginLoc(), NewLoopCount.get()); + + NewLoopCount = + Self.getSema().OpenACC().CheckCollapseLoopCount(NewLoopCount.get()); + + if (!NewLoopCount.isUsable()) + return; + + ParsedClause.setCollapseDetails(C.hasForce(), NewLoopCount.get()); + NewClause = OpenACCCollapseClause::Create( + Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), + ParsedClause.getLParenLoc(), ParsedClause.isForce(), + ParsedClause.getLoopCount(), ParsedClause.getEndLoc()); +} } // namespace template OpenACCClause *TreeTransform::TransformOpenACCClause( @@ -14774,63 +14800,29 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { TPL->containsUnexpandedParameterPack(); } - // Transform the type of the original lambda's call operator. - // The transformation MUST be done in the CurrentInstantiationScope since - // it introduces a mapping of the original to the newly created - // transformed parameters. 
- TypeSourceInfo *NewCallOpTSI = nullptr; - { - auto OldCallOpTypeLoc = - E->getCallOperator()->getTypeSourceInfo()->getTypeLoc(); - - auto TransformFunctionProtoTypeLoc = - [this](TypeLocBuilder &TLB, FunctionProtoTypeLoc FPTL) -> QualType { - SmallVector ExceptionStorage; - return this->TransformFunctionProtoType( - TLB, FPTL, nullptr, Qualifiers(), - [&](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { - return TransformExceptionSpec(FPTL.getBeginLoc(), ESI, - ExceptionStorage, Changed); - }); - }; - - QualType NewCallOpType; - TypeLocBuilder NewCallOpTLBuilder; - - if (auto ATL = OldCallOpTypeLoc.getAs()) { - NewCallOpType = this->TransformAttributedType( - NewCallOpTLBuilder, ATL, - [&](TypeLocBuilder &TLB, TypeLoc TL) -> QualType { - return TransformFunctionProtoTypeLoc( - TLB, TL.castAs()); - }); - } else { - auto FPTL = OldCallOpTypeLoc.castAs(); - NewCallOpType = TransformFunctionProtoTypeLoc(NewCallOpTLBuilder, FPTL); - } - - if (NewCallOpType.isNull()) - return ExprError(); - LSI->ContainsUnexpandedParameterPack |= - NewCallOpType->containsUnexpandedParameterPack(); - NewCallOpTSI = - NewCallOpTLBuilder.getTypeSourceInfo(getSema().Context, NewCallOpType); - } + TypeLocBuilder NewCallOpTLBuilder; + TypeLoc OldCallOpTypeLoc = + E->getCallOperator()->getTypeSourceInfo()->getTypeLoc(); + QualType NewCallOpType = + getDerived().TransformType(NewCallOpTLBuilder, OldCallOpTypeLoc); + if (NewCallOpType.isNull()) + return ExprError(); + LSI->ContainsUnexpandedParameterPack |= + NewCallOpType->containsUnexpandedParameterPack(); + TypeSourceInfo *NewCallOpTSI = + NewCallOpTLBuilder.getTypeSourceInfo(getSema().Context, NewCallOpType); - ArrayRef Params; - if (auto ATL = NewCallOpTSI->getTypeLoc().getAs()) { - Params = ATL.getModifiedLoc().castAs().getParams(); - } else { - auto FPTL = NewCallOpTSI->getTypeLoc().castAs(); - Params = FPTL.getParams(); - } + // The type may be an AttributedType or some other kind of sugar; + // get the actual underlying FunctionProtoType. 
+ auto FPTL = NewCallOpTSI->getTypeLoc().getAsAdjusted(); + assert(FPTL && "Not a FunctionProtoType?"); getSema().CompleteLambdaCallOperator( NewCallOperator, E->getCallOperator()->getLocation(), E->getCallOperator()->getInnerLocStart(), E->getCallOperator()->getTrailingRequiresClause(), NewCallOpTSI, E->getCallOperator()->getConstexprKind(), - E->getCallOperator()->getStorageClass(), Params, + E->getCallOperator()->getStorageClass(), FPTL.getParams(), E->hasExplicitResultType()); getDerived().transformAttrs(E->getCallOperator(), NewCallOperator); diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp index f30642f513ae4..ab4923de6346f 100644 --- a/clang/lib/Serialization/ASTCommon.cpp +++ b/clang/lib/Serialization/ASTCommon.cpp @@ -258,7 +258,7 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) { ID = PREDEF_TYPE_##Id##_ID; \ break; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case BuiltinType::Id: \ ID = PREDEF_TYPE_##Id##_ID; \ break; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index a369ad0be4795..0a4251c0e5240 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2044,14 +2044,14 @@ ASTReader::getGlobalPreprocessedEntityID(ModuleFile &M, const FileEntry *HeaderFileInfoTrait::getFile(const internal_key_type &Key) { FileManager &FileMgr = Reader.getFileManager(); if (!Key.Imported) { - if (auto File = FileMgr.getFile(Key.Filename)) + if (auto File = FileMgr.getOptionalFileRef(Key.Filename)) return *File; return nullptr; } std::string Resolved = std::string(Key.Filename); Reader.ResolveImportedPath(M, Resolved); - if (auto File = FileMgr.getFile(Resolved)) + if (auto File = FileMgr.getOptionalFileRef(Resolved)) return *File; return nullptr; } @@ -4217,7 +4217,7 @@ ASTReader::ReadModuleMapFileBlock(RecordData &Record, ModuleFile &F, assert(M && M->Name == F.ModuleName && "found module with different name"); // Check the primary module map file. - auto StoredModMap = FileMgr.getFile(F.ModuleMapPath); + auto StoredModMap = FileMgr.getOptionalFileRef(F.ModuleMapPath); if (!StoredModMap || *StoredModMap != ModMap) { assert(ModMap && "found module is missing module map file"); assert((ImportedBy || F.Kind == MK_ImplicitModule) && @@ -6647,7 +6647,7 @@ void ASTReader::ReadPragmaDiagnosticMappings(DiagnosticsEngine &Diag) { // command line (-w, -Weverything, -Werror, ...) along with any explicit // -Wblah flags. 
unsigned Flags = Record[Idx++]; - DiagState Initial(*Diag.getDiagnosticIDs()); + DiagState Initial; Initial.SuppressSystemWarnings = Flags & 1; Flags >>= 1; Initial.ErrorsAsFatal = Flags & 1; Flags >>= 1; Initial.WarningsAsErrors = Flags & 1; Flags >>= 1; @@ -7488,7 +7488,7 @@ QualType ASTReader::GetType(TypeID ID) { T = Context.SingletonId; \ break; #include "clang/Basic/WebAssemblyReferenceTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId) \ +#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case PREDEF_TYPE_##Id##_ID: \ T = Context.SingletonId; \ break; @@ -12283,6 +12283,13 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { return OpenACCIndependentClause::Create(getContext(), BeginLoc, EndLoc); case OpenACCClauseKind::Auto: return OpenACCAutoClause::Create(getContext(), BeginLoc, EndLoc); + case OpenACCClauseKind::Collapse: { + SourceLocation LParenLoc = readSourceLocation(); + bool HasForce = readBool(); + Expr *LoopCount = readSubExpr(); + return OpenACCCollapseClause::Create(getContext(), BeginLoc, LParenLoc, + HasForce, LoopCount, EndLoc); + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: @@ -12296,7 +12303,6 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { case OpenACCClauseKind::DeviceResident: case OpenACCClauseKind::Host: case OpenACCClauseKind::Link: - case OpenACCClauseKind::Collapse: case OpenACCClauseKind::Bind: case OpenACCClauseKind::DeviceNum: case OpenACCClauseKind::DefaultAsync: diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index f326e3c2e2ff7..aa9764e25c323 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -3220,7 +3220,7 @@ void ASTWriter::WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag, // Skip default mappings. We have a mapping for every diagnostic ever // emitted, regardless of whether it was customized. if (!I.second.isPragma() && - I.second == Diag.getDiagnosticIDs()->getDefaultMapping(I.first)) + I.second == DiagnosticIDs::getDefaultMapping(I.first)) continue; Mappings.push_back(I); } @@ -5713,8 +5713,7 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { // efficient because it allows lazy deserialization. RecordData FunctionToLambdasMapRecord; for (const auto &Pair : FunctionToLambdasMap) { - FunctionToLambdasMapRecord.push_back( - GetDeclRef(Pair.first).getRawValue()); + FunctionToLambdasMapRecord.push_back(Pair.first.getRawValue()); FunctionToLambdasMapRecord.push_back(Pair.second.size()); for (const auto &Lambda : Pair.second) FunctionToLambdasMapRecord.push_back(Lambda.getRawValue()); @@ -8149,6 +8148,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { // Nothing to do here, there is no additional information beyond the // begin/end loc and clause kind.
return; + case OpenACCClauseKind::Collapse: { + const auto *CC = cast(C); + writeSourceLocation(CC->getLParenLoc()); + writeBool(CC->hasForce()); + AddStmt(const_cast(CC->getLoopCount())); + return; + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: @@ -8162,7 +8168,6 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::DeviceResident: case OpenACCClauseKind::Host: case OpenACCClauseKind::Link: - case OpenACCClauseKind::Collapse: case OpenACCClauseKind::Bind: case OpenACCClauseKind::DeviceNum: case OpenACCClauseKind::DefaultAsync: diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 50c090b195d61..b71684569609a 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -1524,7 +1524,8 @@ void ASTDeclWriter::VisitCXXRecordDecl(CXXRecordDecl *D) { // For lambdas inside canonical FunctionDecl remember the mapping. if (auto FD = llvm::dyn_cast_or_null(D->getDeclContext()); FD && FD->isCanonicalDecl()) { - Writer.FunctionToLambdasMap[FD].push_back(Writer.GetDeclRef(D)); + Writer.FunctionToLambdasMap[Writer.GetDeclRef(FD)].push_back( + Writer.GetDeclRef(D)); } } else { Record.push_back(CXXRecNotTemplate); @@ -1784,6 +1785,18 @@ void ASTDeclWriter::VisitClassTemplateSpecializationDecl( if (ArgsWritten) Record.AddASTTemplateArgumentListInfo(ArgsWritten); + // Mention the implicitly generated C++ deduction guide to make sure the + // deduction guide will be rewritten as expected. + // + // FIXME: Would it be more efficient to add a callback register function + // in sema to register the deduction guide? + if (Writer.isWritingStdCXXNamedModules()) { + auto Name = Context.DeclarationNames.getCXXDeductionGuideName( + D->getSpecializedTemplate()); + for (auto *DG : D->getDeclContext()->noload_lookup(Name)) + Writer.GetDeclRef(DG->getCanonicalDecl()); + } + Code = serialization::DECL_CLASS_TEMPLATE_SPECIALIZATION; } diff --git a/clang/lib/Serialization/ModuleManager.cpp b/clang/lib/Serialization/ModuleManager.cpp index 51b6429412960..e74a16b636802 100644 --- a/clang/lib/Serialization/ModuleManager.cpp +++ b/clang/lib/Serialization/ModuleManager.cpp @@ -42,8 +42,8 @@ using namespace clang; using namespace serialization; ModuleFile *ModuleManager::lookupByFileName(StringRef Name) const { - auto Entry = FileMgr.getFile(Name, /*OpenFile=*/false, - /*CacheFailure=*/false); + auto Entry = FileMgr.getOptionalFileRef(Name, /*OpenFile=*/false, + /*CacheFailure=*/false); if (Entry) return lookup(*Entry); @@ -64,8 +64,8 @@ ModuleFile *ModuleManager::lookup(const FileEntry *File) const { std::unique_ptr ModuleManager::lookupBuffer(StringRef Name) { - auto Entry = FileMgr.getFile(Name, /*OpenFile=*/false, - /*CacheFailure=*/false); + auto Entry = FileMgr.getOptionalFileRef(Name, /*OpenFile=*/false, + /*CacheFailure=*/false); if (!Entry) return nullptr; return std::move(InMemoryBuffers[*Entry]); @@ -279,8 +279,8 @@ void ModuleManager::removeModules(ModuleIterator First) { void ModuleManager::addInMemoryBuffer(StringRef FileName, std::unique_ptr Buffer) { - const FileEntry *Entry = - FileMgr.getVirtualFile(FileName, Buffer->getBufferSize(), 0); + FileEntryRef Entry = + FileMgr.getVirtualFileRef(FileName, Buffer->getBufferSize(), 0); InMemoryBuffers[Entry] = std::move(Buffer); } diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt index 414282d58f779..6da3665ab9a4d 100644 --- 
a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -132,7 +132,7 @@ add_clang_library(clangStaticAnalyzerCheckers VLASizeChecker.cpp ValistChecker.cpp VirtualCallChecker.cpp - WebKit/NoUncountedMembersChecker.cpp + WebKit/RawPtrRefMemberChecker.cpp WebKit/ASTUtils.cpp WebKit/PtrTypesSemantics.cpp WebKit/RefCntblBaseVirtualDtorChecker.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp index 5496f087447fb..7c8b44eb05942 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp @@ -25,9 +25,7 @@ using namespace ento; using namespace taint; namespace { -class DivZeroChecker : public Checker< check::PreStmt > { - const BugType BT{this, "Division by zero"}; - const BugType TaintBT{this, "Division by zero", categories::TaintedData}; +class DivZeroChecker : public Checker> { void reportBug(StringRef Msg, ProgramStateRef StateZero, CheckerContext &C) const; void reportTaintBug(StringRef Msg, ProgramStateRef StateZero, @@ -35,6 +33,12 @@ class DivZeroChecker : public Checker< check::PreStmt > { llvm::ArrayRef TaintedSyms) const; public: + /// This checker class implements several user facing checkers + enum CheckKind { CK_DivideZero, CK_TaintedDivChecker, CK_NumCheckKinds }; + bool ChecksEnabled[CK_NumCheckKinds] = {false}; + CheckerNameRef CheckNames[CK_NumCheckKinds]; + mutable std::unique_ptr BugTypes[CK_NumCheckKinds]; + void checkPreStmt(const BinaryOperator *B, CheckerContext &C) const; }; } // end anonymous namespace @@ -48,8 +52,14 @@ static const Expr *getDenomExpr(const ExplodedNode *N) { void DivZeroChecker::reportBug(StringRef Msg, ProgramStateRef StateZero, CheckerContext &C) const { + if (!ChecksEnabled[CK_DivideZero]) + return; + if (!BugTypes[CK_DivideZero]) + BugTypes[CK_DivideZero].reset( + new BugType(CheckNames[CK_DivideZero], "Division by zero")); if (ExplodedNode *N = C.generateErrorNode(StateZero)) { - auto R = std::make_unique(BT, Msg, N); + auto R = std::make_unique(*BugTypes[CK_DivideZero], + Msg, N); bugreporter::trackExpressionValue(N, getDenomExpr(N), *R); C.emitReport(std::move(R)); } @@ -58,8 +68,15 @@ void DivZeroChecker::reportBug(StringRef Msg, ProgramStateRef StateZero, void DivZeroChecker::reportTaintBug( StringRef Msg, ProgramStateRef StateZero, CheckerContext &C, llvm::ArrayRef TaintedSyms) const { - if (ExplodedNode *N = C.generateErrorNode(StateZero)) { - auto R = std::make_unique(TaintBT, Msg, N); + if (!ChecksEnabled[CK_TaintedDivChecker]) + return; + if (!BugTypes[CK_TaintedDivChecker]) + BugTypes[CK_TaintedDivChecker].reset( + new BugType(CheckNames[CK_TaintedDivChecker], "Division by zero", + categories::TaintedData)); + if (ExplodedNode *N = C.generateNonFatalErrorNode(StateZero)) { + auto R = std::make_unique( + *BugTypes[CK_TaintedDivChecker], Msg, N); bugreporter::trackExpressionValue(N, getDenomExpr(N), *R); for (auto Sym : TaintedSyms) R->markInteresting(Sym); @@ -101,8 +118,8 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B, if ((stateNotZero && stateZero)) { std::vector taintedSyms = getTaintedSymbols(C.getState(), *DV); if (!taintedSyms.empty()) { - reportTaintBug("Division by a tainted value, possibly zero", stateZero, C, - taintedSyms); + reportTaintBug("Division by a tainted value, possibly zero", stateNotZero, + C, taintedSyms); return; } } @@ -113,9 +130,27 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B, } void 
ento::registerDivZeroChecker(CheckerManager &mgr) { - mgr.registerChecker(); + DivZeroChecker *checker = mgr.registerChecker(); + checker->ChecksEnabled[DivZeroChecker::CK_DivideZero] = true; + checker->CheckNames[DivZeroChecker::CK_DivideZero] = + mgr.getCurrentCheckerName(); } bool ento::shouldRegisterDivZeroChecker(const CheckerManager &mgr) { return true; } + +void ento::registerTaintedDivChecker(CheckerManager &mgr) { + DivZeroChecker *checker; + if (!mgr.isRegisteredChecker()) + checker = mgr.registerChecker(); + else + checker = mgr.getChecker(); + checker->ChecksEnabled[DivZeroChecker::CK_TaintedDivChecker] = true; + checker->CheckNames[DivZeroChecker::CK_TaintedDivChecker] = + mgr.getCurrentCheckerName(); +} + +bool ento::shouldRegisterTaintedDivChecker(const CheckerManager &mgr) { + return true; +} diff --git a/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp index 7aefcdc6d358a..e7fde3edc7f9e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp @@ -48,6 +48,9 @@ void FixedAddressChecker::checkPreStmt(const BinaryOperator *B, if (!RV.isConstant() || RV.isZeroConstant()) return; + if (C.getSourceManager().isInSystemMacro(B->getRHS()->getBeginLoc())) + return; + if (ExplodedNode *N = C.generateNonFatalErrorNode()) { // FIXME: improve grammar in the following strings: constexpr llvm::StringLiteral Msg = diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 81ec8e1b51698..3e95db7e97fac 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1811,9 +1811,9 @@ MallocChecker::MallocMemReturnsAttr(CheckerContext &C, const CallEvent &Call, if (!Att->args().empty()) { return MallocMemAux(C, Call, Call.getArgExpr(Att->args_begin()->getASTIndex()), - UndefinedVal(), State, Family); + UnknownVal(), State, Family); } - return MallocMemAux(C, Call, UnknownVal(), UndefinedVal(), State, Family); + return MallocMemAux(C, Call, UnknownVal(), UnknownVal(), State, Family); } ProgramStateRef MallocChecker::MallocBindRetVal(CheckerContext &C, diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 9da3e54e45431..4d145be808f6d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -19,8 +19,7 @@ using namespace clang; namespace { -bool hasPublicMethodInBaseClass(const CXXRecordDecl *R, - const char *NameToMatch) { +bool hasPublicMethodInBaseClass(const CXXRecordDecl *R, StringRef NameToMatch) { assert(R); assert(R->hasDefinition()); @@ -37,7 +36,7 @@ bool hasPublicMethodInBaseClass(const CXXRecordDecl *R, namespace clang { std::optional -hasPublicMethodInBase(const CXXBaseSpecifier *Base, const char *NameToMatch) { +hasPublicMethodInBase(const CXXBaseSpecifier *Base, StringRef NameToMatch) { assert(Base); const Type *T = Base->getType().getTypePtrOrNull(); @@ -53,16 +52,17 @@ hasPublicMethodInBase(const CXXBaseSpecifier *Base, const char *NameToMatch) { return hasPublicMethodInBaseClass(R, NameToMatch) ? 
R : nullptr; } -std::optional isRefCountable(const CXXRecordDecl* R) -{ +std::optional isSmartPtrCompatible(const CXXRecordDecl *R, + StringRef IncMethodName, + StringRef DecMethodName) { assert(R); R = R->getDefinition(); if (!R) return std::nullopt; - bool hasRef = hasPublicMethodInBaseClass(R, "ref"); - bool hasDeref = hasPublicMethodInBaseClass(R, "deref"); + bool hasRef = hasPublicMethodInBaseClass(R, IncMethodName); + bool hasDeref = hasPublicMethodInBaseClass(R, DecMethodName); if (hasRef && hasDeref) return true; @@ -70,15 +70,15 @@ std::optional isRefCountable(const CXXRecordDecl* R) Paths.setOrigin(const_cast(R)); bool AnyInconclusiveBase = false; - const auto hasPublicRefInBase = - [&AnyInconclusiveBase](const CXXBaseSpecifier *Base, CXXBasePath &) { - auto hasRefInBase = clang::hasPublicMethodInBase(Base, "ref"); - if (!hasRefInBase) { - AnyInconclusiveBase = true; - return false; - } - return (*hasRefInBase) != nullptr; - }; + const auto hasPublicRefInBase = [&](const CXXBaseSpecifier *Base, + CXXBasePath &) { + auto hasRefInBase = clang::hasPublicMethodInBase(Base, IncMethodName); + if (!hasRefInBase) { + AnyInconclusiveBase = true; + return false; + } + return (*hasRefInBase) != nullptr; + }; hasRef = hasRef || R->lookupInBases(hasPublicRefInBase, Paths, /*LookupInDependent =*/true); @@ -86,15 +86,15 @@ std::optional isRefCountable(const CXXRecordDecl* R) return std::nullopt; Paths.clear(); - const auto hasPublicDerefInBase = - [&AnyInconclusiveBase](const CXXBaseSpecifier *Base, CXXBasePath &) { - auto hasDerefInBase = clang::hasPublicMethodInBase(Base, "deref"); - if (!hasDerefInBase) { - AnyInconclusiveBase = true; - return false; - } - return (*hasDerefInBase) != nullptr; - }; + const auto hasPublicDerefInBase = [&](const CXXBaseSpecifier *Base, + CXXBasePath &) { + auto hasDerefInBase = clang::hasPublicMethodInBase(Base, DecMethodName); + if (!hasDerefInBase) { + AnyInconclusiveBase = true; + return false; + } + return (*hasDerefInBase) != nullptr; + }; hasDeref = hasDeref || R->lookupInBases(hasPublicDerefInBase, Paths, /*LookupInDependent =*/true); if (AnyInconclusiveBase) @@ -103,11 +103,23 @@ std::optional isRefCountable(const CXXRecordDecl* R) return hasRef && hasDeref; } +std::optional isRefCountable(const clang::CXXRecordDecl *R) { + return isSmartPtrCompatible(R, "ref", "deref"); +} + +std::optional isCheckedPtrCapable(const clang::CXXRecordDecl *R) { + return isSmartPtrCompatible(R, "incrementPtrCount", "decrementPtrCount"); +} + bool isRefType(const std::string &Name) { return Name == "Ref" || Name == "RefAllowingPartiallyDestroyed" || Name == "RefPtr" || Name == "RefPtrAllowingPartiallyDestroyed"; } +bool isCheckedPtr(const std::string &Name) { + return Name == "CheckedPtr" || Name == "CheckedRef"; +} + bool isCtorOfRefCounted(const clang::FunctionDecl *F) { assert(F); const std::string &FunctionName = safeGetName(F); @@ -155,7 +167,7 @@ std::optional isUncounted(const QualType T) { std::optional isUncounted(const CXXRecordDecl* Class) { // Keep isRefCounted first as it's cheaper. 
- if (isRefCounted(Class)) + if (!Class || isRefCounted(Class)) return false; std::optional IsRefCountable = isRefCountable(Class); @@ -217,6 +229,15 @@ bool isRefCounted(const CXXRecordDecl *R) { return false; } +bool isCheckedPtr(const CXXRecordDecl *R) { + assert(R); + if (auto *TmplR = R->getTemplateInstantiationPattern()) { + const auto &ClassName = safeGetName(TmplR); + return isCheckedPtr(ClassName); + } + return false; +} + bool isPtrConversion(const FunctionDecl *F) { assert(F); if (isCtorOfRefCounted(F)) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index e2d0342bebd52..3528c52a7d659 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -34,15 +34,23 @@ class Type; /// \returns CXXRecordDecl of the base if the type has ref as a public method, /// nullptr if not, std::nullopt if inconclusive. std::optional -hasPublicMethodInBase(const CXXBaseSpecifier *Base, const char *NameToMatch); +hasPublicMethodInBase(const CXXBaseSpecifier *Base, + llvm::StringRef NameToMatch); /// \returns true if \p Class is ref-countable, false if not, std::nullopt if /// inconclusive. -std::optional isRefCountable(const clang::CXXRecordDecl* Class); +std::optional isRefCountable(const clang::CXXRecordDecl *Class); + +/// \returns true if \p Class is checked-pointer compatible, false if not, +/// std::nullopt if inconclusive. +std::optional isCheckedPtrCapable(const clang::CXXRecordDecl *Class); /// \returns true if \p Class is ref-counted, false if not. bool isRefCounted(const clang::CXXRecordDecl *Class); +/// \returns true if \p Class is a CheckedPtr / CheckedRef, false if not. +bool isCheckedPtr(const clang::CXXRecordDecl *Class); + /// \returns true if \p Class is ref-countable AND not ref-counted, false if /// not, std::nullopt if inconclusive. std::optional isUncounted(const clang::QualType T); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp similarity index 63% rename from clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp rename to clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index 69a0eb3086ab7..2ce6bc330e0ca 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -1,4 +1,4 @@ -//=======- NoUncountedMembersChecker.cpp -------------------------*- C++ -*-==// +//=======- RawPtrRefMemberChecker.cpp ----------------------------*- C++ -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
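The PtrTypesSemantics change above folds the old ref/deref lookup and the new incrementPtrCount/decrementPtrCount lookup into one predicate, isSmartPtrCompatible, parameterized by method names. A simplified, self-contained sketch of that factoring (it omits the base-class traversal and the inconclusive std::nullopt cases the real code handles):

#include <optional>
#include <string>
#include <vector>

struct Method { std::string Name; bool IsPublic; };
struct Record { std::vector<Method> Methods; };

static bool hasPublicMethod(const Record &R, const std::string &Name) {
  for (const Method &M : R.Methods)
    if (M.IsPublic && M.Name == Name)
      return true;
  return false;
}

// One predicate, parameterized by the increment/decrement method names, so a
// single implementation serves both ref-countable and CheckedPtr-capable
// types.
static std::optional<bool> isSmartPtrCompatible(const Record &R,
                                                const std::string &Inc,
                                                const std::string &Dec) {
  return hasPublicMethod(R, Inc) && hasPublicMethod(R, Dec);
}

static std::optional<bool> isRefCountable(const Record &R) {
  return isSmartPtrCompatible(R, "ref", "deref");
}

static std::optional<bool> isCheckedPtrCapable(const Record &R) {
  return isSmartPtrCompatible(R, "incrementPtrCount", "decrementPtrCount");
}

The renamed RawPtrRefMemberChecker below consumes these predicates through its isPtrCompatible hook, so each concrete checker only selects a capability and the diagnostic wording.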
@@ -25,18 +25,21 @@ using namespace ento; namespace { -class NoUncountedMemberChecker +class RawPtrRefMemberChecker : public Checker> { private: BugType Bug; mutable BugReporter *BR; public: - NoUncountedMemberChecker() - : Bug(this, - "Member variable is a raw-pointer/reference to reference-countable " - "type", - "WebKit coding guidelines") {} + RawPtrRefMemberChecker(const char *description) + : Bug(this, description, "WebKit coding guidelines") {} + + virtual std::optional + isPtrCompatible(const clang::CXXRecordDecl *) const = 0; + virtual bool isPtrCls(const clang::CXXRecordDecl *) const = 0; + virtual const char *typeName() const = 0; + virtual const char *invariant() const = 0; void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, BugReporter &BRArg) const { @@ -46,8 +49,8 @@ class NoUncountedMemberChecker // visit template instantiations or lambda classes. We // want to visit those, so we make our own RecursiveASTVisitor. struct LocalVisitor : public RecursiveASTVisitor { - const NoUncountedMemberChecker *Checker; - explicit LocalVisitor(const NoUncountedMemberChecker *Checker) + const RawPtrRefMemberChecker *Checker; + explicit LocalVisitor(const RawPtrRefMemberChecker *Checker) : Checker(Checker) { assert(Checker); } @@ -77,9 +80,9 @@ class NoUncountedMemberChecker if (auto *MemberCXXRD = MemberType->getPointeeCXXRecordDecl()) { // If we don't see the definition we just don't know. if (MemberCXXRD->hasDefinition()) { - std::optional isRCAble = isRefCountable(MemberCXXRD); - if (isRCAble && *isRCAble) - reportBug(Member, MemberType, MemberCXXRD, RD); + std::optional isRCAble = isPtrCompatible(MemberCXXRD); + if (isRCAble && *isRCAble) + reportBug(Member, MemberType, MemberCXXRD, RD); } } } @@ -114,7 +117,7 @@ class NoUncountedMemberChecker // a member but we trust them to handle it correctly. auto CXXRD = llvm::dyn_cast_or_null(RD); if (CXXRD) - return isRefCounted(CXXRD); + return isPtrCls(CXXRD); return false; } @@ -134,10 +137,10 @@ class NoUncountedMemberChecker Os << " in "; printQuotedQualifiedName(Os, ClassCXXRD); Os << " is a " - << (isa(MemberType) ? "raw pointer" : "reference") - << " to ref-countable type "; + << (isa(MemberType) ? 
"raw pointer" : "reference") << " to " + << typeName() << " "; printQuotedQualifiedName(Os, MemberCXXRD); - Os << "; member variables must be ref-counted."; + Os << "; " << invariant() << "."; PathDiagnosticLocation BSLoc(Member->getSourceRange().getBegin(), BR->getSourceManager()); @@ -146,13 +149,67 @@ class NoUncountedMemberChecker BR->emitReport(std::move(Report)); } }; + +class NoUncountedMemberChecker final : public RawPtrRefMemberChecker { +public: + NoUncountedMemberChecker() + : RawPtrRefMemberChecker("Member variable is a raw-pointer/reference to " + "reference-countable type") {} + + std::optional + isPtrCompatible(const clang::CXXRecordDecl *R) const final { + return isRefCountable(R); + } + + bool isPtrCls(const clang::CXXRecordDecl *R) const final { + return isRefCounted(R); + } + + const char *typeName() const final { return "ref-countable type"; } + + const char *invariant() const final { + return "member variables must be Ref, RefPtr, WeakRef, or WeakPtr"; + } +}; + +class NoUncheckedPtrMemberChecker final : public RawPtrRefMemberChecker { +public: + NoUncheckedPtrMemberChecker() + : RawPtrRefMemberChecker("Member variable is a raw-pointer/reference to " + "checked-pointer capable type") {} + + std::optional + isPtrCompatible(const clang::CXXRecordDecl *R) const final { + return isCheckedPtrCapable(R); + } + + bool isPtrCls(const clang::CXXRecordDecl *R) const final { + return isCheckedPtr(R); + } + + const char *typeName() const final { return "CheckedPtr capable type"; } + + const char *invariant() const final { + return "member variables must be a CheckedPtr, CheckedRef, WeakRef, or " + "WeakPtr"; + } +}; + } // namespace void ento::registerNoUncountedMemberChecker(CheckerManager &Mgr) { Mgr.registerChecker(); } -bool ento::shouldRegisterNoUncountedMemberChecker( +bool ento::shouldRegisterNoUncountedMemberChecker(const CheckerManager &Mgr) { + return true; +} + +void ento::registerNoUncheckedPtrMemberChecker(CheckerManager &Mgr) { + Mgr.registerChecker(); +} + +bool ento::shouldRegisterNoUncheckedPtrMemberChecker( const CheckerManager &Mgr) { return true; } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp index 81c2434ce6477..0ed93ab26bf5c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp @@ -18,6 +18,8 @@ #include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h" #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/SaveAndRestore.h" #include using namespace clang; @@ -44,7 +46,11 @@ class UncountedCallArgsChecker // visit template instantiations or lambda classes. We // want to visit those, so we make our own RecursiveASTVisitor. 
struct LocalVisitor : public RecursiveASTVisitor { + using Base = RecursiveASTVisitor; + const UncountedCallArgsChecker *Checker; + Decl *DeclWithIssue{nullptr}; + explicit LocalVisitor(const UncountedCallArgsChecker *Checker) : Checker(Checker) { assert(Checker); @@ -56,12 +62,18 @@ class UncountedCallArgsChecker bool TraverseClassTemplateDecl(ClassTemplateDecl *Decl) { if (isRefType(safeGetName(Decl))) return true; - return RecursiveASTVisitor::TraverseClassTemplateDecl( - Decl); + return Base::TraverseClassTemplateDecl(Decl); + } + + bool TraverseDecl(Decl *D) { + llvm::SaveAndRestore SavedDecl(DeclWithIssue); + if (D && (isa(D) || isa(D))) + DeclWithIssue = D; + return Base::TraverseDecl(D); } bool VisitCallExpr(const CallExpr *CE) { - Checker->visitCallExpr(CE); + Checker->visitCallExpr(CE, DeclWithIssue); return true; } }; @@ -70,7 +82,7 @@ class UncountedCallArgsChecker visitor.TraverseDecl(const_cast(TUD)); } - void visitCallExpr(const CallExpr *CE) const { + void visitCallExpr(const CallExpr *CE, const Decl *D) const { if (shouldSkipCall(CE)) return; @@ -86,10 +98,10 @@ class UncountedCallArgsChecker return; } auto *E = MemberCallExpr->getImplicitObjectArgument(); - QualType ArgType = MemberCallExpr->getObjectType(); + QualType ArgType = MemberCallExpr->getObjectType().getCanonicalType(); std::optional IsUncounted = isUncounted(ArgType); if (IsUncounted && *IsUncounted && !isPtrOriginSafe(E)) - reportBugOnThis(E); + reportBugOnThis(E, D); } for (auto P = F->param_begin(); @@ -102,12 +114,13 @@ class UncountedCallArgsChecker // if ((*P)->hasAttr()) // continue; - const auto *ArgType = (*P)->getType().getTypePtrOrNull(); - if (!ArgType) + QualType ArgType = (*P)->getType().getCanonicalType(); + const auto *TypePtr = ArgType.getTypePtrOrNull(); + if (!TypePtr) continue; // FIXME? Should we bail? 
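// A side note on the getCanonicalType() calls introduced above (hypothetical
// example, not part of this patch): canonicalizing first strips sugar, so a
// type alias cannot hide an uncounted pointer from the checker.
//
//   using NodePtr = RefCountable *; // sugared alias
//   void take(NodePtr);             // parameter type prints as 'NodePtr'
//
// (*P)->getType() would still be the sugared 'NodePtr'; getCanonicalType()
// reduces it to 'RefCountable *', which isUncountedPtr() recognizes.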
// FIXME: more complex types (arrays, references to raw pointers, etc) - std::optional IsUncounted = isUncountedPtr(ArgType); + std::optional IsUncounted = isUncountedPtr(TypePtr); if (!IsUncounted || !(*IsUncounted)) continue; @@ -119,7 +132,7 @@ class UncountedCallArgsChecker if (isPtrOriginSafe(Arg)) continue; - reportBug(Arg, *P); + reportBug(Arg, *P, D); } } } @@ -240,7 +253,8 @@ class UncountedCallArgsChecker ClsName.ends_with("String")); } - void reportBug(const Expr *CallArg, const ParmVarDecl *Param) const { + void reportBug(const Expr *CallArg, const ParmVarDecl *Param, + const Decl *DeclWithIssue) const { assert(CallArg); SmallString<100> Buf; @@ -261,10 +275,11 @@ class UncountedCallArgsChecker PathDiagnosticLocation BSLoc(SrcLocToReport, BR->getSourceManager()); auto Report = std::make_unique(Bug, Os.str(), BSLoc); Report->addRange(CallArg->getSourceRange()); + Report->setDeclWithIssue(DeclWithIssue); BR->emitReport(std::move(Report)); } - void reportBugOnThis(const Expr *CallArg) const { + void reportBugOnThis(const Expr *CallArg, const Decl *DeclWithIssue) const { assert(CallArg); const SourceLocation SrcLocToReport = CallArg->getSourceRange().getBegin(); @@ -274,6 +289,7 @@ class UncountedCallArgsChecker Bug, "Call argument for 'this' parameter is uncounted and unsafe.", BSLoc); Report->addRange(CallArg->getSourceRange()); + Report->setDeclWithIssue(DeclWithIssue); BR->emitReport(std::move(Report)); } }; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp index 274da0baf2ce5..9d0a3bb5da732 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp @@ -121,6 +121,7 @@ class UncountedLocalVarsChecker // want to visit those, so we make our own RecursiveASTVisitor. 
struct LocalVisitor : public RecursiveASTVisitor { const UncountedLocalVarsChecker *Checker; + Decl *DeclWithIssue{nullptr}; TrivialFunctionAnalysis TFA; @@ -134,10 +135,17 @@ class UncountedLocalVarsChecker bool shouldVisitTemplateInstantiations() const { return true; } bool shouldVisitImplicitCode() const { return false; } + bool TraverseDecl(Decl *D) { + llvm::SaveAndRestore SavedDecl(DeclWithIssue); + if (D && (isa(D) || isa(D))) + DeclWithIssue = D; + return Base::TraverseDecl(D); + } + bool VisitVarDecl(VarDecl *V) { auto *Init = V->getInit(); if (Init && V->isLocalVarDecl()) - Checker->visitVarDecl(V, Init); + Checker->visitVarDecl(V, Init, DeclWithIssue); return true; } @@ -145,7 +153,7 @@ class UncountedLocalVarsChecker if (BO->isAssignmentOp()) { if (auto *VarRef = dyn_cast(BO->getLHS())) { if (auto *V = dyn_cast(VarRef->getDecl())) - Checker->visitVarDecl(V, BO->getRHS()); + Checker->visitVarDecl(V, BO->getRHS(), DeclWithIssue); } } return true; @@ -186,7 +194,8 @@ class UncountedLocalVarsChecker visitor.TraverseDecl(const_cast(TUD)); } - void visitVarDecl(const VarDecl *V, const Expr *Value) const { + void visitVarDecl(const VarDecl *V, const Expr *Value, + const Decl *DeclWithIssue) const { if (shouldSkipVarDecl(V)) return; @@ -240,7 +249,7 @@ class UncountedLocalVarsChecker })) return; - reportBug(V, Value); + reportBug(V, Value, DeclWithIssue); } } @@ -249,7 +258,8 @@ class UncountedLocalVarsChecker return BR->getSourceManager().isInSystemHeader(V->getLocation()); } - void reportBug(const VarDecl *V, const Expr *Value) const { + void reportBug(const VarDecl *V, const Expr *Value, + const Decl *DeclWithIssue) const { assert(V); SmallString<100> Buf; llvm::raw_svector_ostream Os(Buf); @@ -278,6 +288,7 @@ class UncountedLocalVarsChecker PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager()); auto Report = std::make_unique(Bug, Os.str(), BSLoc); Report->addRange(V->getSourceRange()); + Report->setDeclWithIssue(DeclWithIssue); BR->emitReport(std::move(Report)); } } diff --git a/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp index 7cdd545e61b32..71268af22e242 100644 --- a/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp @@ -91,6 +91,7 @@ class TextDiagnostics : public PathDiagnosticConsumer { ? " [" + PD->getCheckerName() + "]" : "") .str(); + reportPiece(WarnID, PD->getLocation().asLocation(), (PD->getShortDescription() + WarningMsg).str(), PD->path.back()->getRanges(), PD->path.back()->getFixits()); diff --git a/clang/lib/Testing/CommandLineArgs.cpp b/clang/lib/Testing/CommandLineArgs.cpp index 3abc689b93e8d..88c6ce0e098bf 100644 --- a/clang/lib/Testing/CommandLineArgs.cpp +++ b/clang/lib/Testing/CommandLineArgs.cpp @@ -11,99 +11,79 @@ #include "llvm/Support/ErrorHandling.h" namespace clang { +std::vector getCOrLater(const int MinimumStd) { + std::vector Result{}; + +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + if (version >= MinimumStd) \ + Result.push_back(Lang_##lang##version); +#include "clang/Testing/TestLanguage.def" + + return Result; +} +std::vector getCXXOrLater(const int MinimumStd) { + std::vector Result{}; + +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + if (version >= MinimumStd) \ + Result.push_back(Lang_##lang##version); +#include "clang/Testing/TestLanguage.def" + + return Result; +} std::vector getCommandLineArgsForTesting(TestLanguage Lang) { - std::vector Args; // Test with basic arguments. 
switch (Lang) { - case Lang_C89: - Args = {"-x", "c", "-std=c89"}; - break; - case Lang_C99: - Args = {"-x", "c", "-std=c99"}; - break; - case Lang_CXX03: - Args = {"-std=c++03", "-frtti"}; - break; - case Lang_CXX11: - Args = {"-std=c++11", "-frtti"}; - break; - case Lang_CXX14: - Args = {"-std=c++14", "-frtti"}; - break; - case Lang_CXX17: - Args = {"-std=c++17", "-frtti"}; - break; - case Lang_CXX20: - Args = {"-std=c++20", "-frtti"}; - break; - case Lang_CXX23: - Args = {"-std=c++23", "-frtti"}; - break; +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return { "-x", "c", "-std=" #std_flag }; +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return { "-std=" #std_flag, "-frtti" }; +#include "clang/Testing/TestLanguage.def" + case Lang_OBJC: - Args = {"-x", "objective-c", "-frtti", "-fobjc-nonfragile-abi"}; - break; + return {"-x", "objective-c", "-frtti", "-fobjc-nonfragile-abi"}; case Lang_OBJCXX: - Args = {"-x", "objective-c++", "-frtti"}; - break; + return {"-x", "objective-c++", "-frtti"}; case Lang_OpenCL: - llvm_unreachable("Not implemented yet!"); + llvm_unreachable("Unhandled TestLanguage enum"); } - return Args; + llvm_unreachable("Unhandled TestLanguage enum"); } std::vector getCC1ArgsForTesting(TestLanguage Lang) { - std::vector Args; switch (Lang) { - case Lang_C89: - Args = {"-xc", "-std=c89"}; - break; - case Lang_C99: - Args = {"-xc", "-std=c99"}; - break; - case Lang_CXX03: - Args = {"-std=c++03"}; - break; - case Lang_CXX11: - Args = {"-std=c++11"}; - break; - case Lang_CXX14: - Args = {"-std=c++14"}; - break; - case Lang_CXX17: - Args = {"-std=c++17"}; - break; - case Lang_CXX20: - Args = {"-std=c++20"}; - break; - case Lang_CXX23: - Args = {"-std=c++23"}; - break; +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return { "-xc", "-std=" #std_flag }; +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ + return { "-std=" #std_flag }; +#include "clang/Testing/TestLanguage.def" + case Lang_OBJC: - Args = {"-xobjective-c"}; + return {"-xobjective-c"}; break; case Lang_OBJCXX: - Args = {"-xobjective-c++"}; + return {"-xobjective-c++"}; break; case Lang_OpenCL: - llvm_unreachable("Not implemented yet!"); + llvm_unreachable("Unhandled TestLanguage enum"); } - return Args; + llvm_unreachable("Unhandled TestLanguage enum"); } StringRef getFilenameForTesting(TestLanguage Lang) { switch (Lang) { - case Lang_C89: - case Lang_C99: +#define TESTLANGUAGE_C(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ return "input.c"; - - case Lang_CXX03: - case Lang_CXX11: - case Lang_CXX14: - case Lang_CXX17: - case Lang_CXX20: - case Lang_CXX23: +#define TESTLANGUAGE_CXX(lang, version, std_flag, version_index) \ + case Lang_##lang##version: \ return "input.cc"; +#include "clang/Testing/TestLanguage.def" case Lang_OpenCL: return "input.cl"; diff --git a/clang/lib/Tooling/Core/Replacement.cpp b/clang/lib/Tooling/Core/Replacement.cpp index 89a5b15244274..92e9859ca206e 100644 --- a/clang/lib/Tooling/Core/Replacement.cpp +++ b/clang/lib/Tooling/Core/Replacement.cpp @@ -614,7 +614,7 @@ std::map groupReplacementsByFile( std::map Result; llvm::SmallPtrSet ProcessedFileEntries; for (const auto &Entry : FileToReplaces) { - auto FE = FileMgr.getFile(Entry.first); + auto FE = FileMgr.getOptionalFileRef(Entry.first); if (!FE) llvm::errs() << "File path " << Entry.first << " is 
invalid.\n"; else if (ProcessedFileEntries.insert(*FE).second) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index c775adc0ddd73..677f426590ab9 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -241,7 +241,7 @@ ModuleDepCollector::getInvocationAdjustedForModuleBuildWithoutOutputs( ModuleMapInputKind); auto CurrentModuleMapEntry = - ScanInstance.getFileManager().getFile(Deps.ClangModuleMapFile); + ScanInstance.getFileManager().getOptionalFileRef(Deps.ClangModuleMapFile); assert(CurrentModuleMapEntry && "module map file entry not found"); // Remove directly passed modulemap files. They will get added back if they @@ -251,7 +251,8 @@ ModuleDepCollector::getInvocationAdjustedForModuleBuildWithoutOutputs( auto DepModuleMapFiles = collectModuleMapFiles(Deps.ClangModuleDeps); for (StringRef ModuleMapFile : Deps.ModuleMapFileDeps) { // TODO: Track these as `FileEntryRef` to simplify the equality check below. - auto ModuleMapEntry = ScanInstance.getFileManager().getFile(ModuleMapFile); + auto ModuleMapEntry = + ScanInstance.getFileManager().getOptionalFileRef(ModuleMapFile); assert(ModuleMapEntry && "module map file entry not found"); // Don't report module maps describing eagerly-loaded dependency. This @@ -299,7 +300,8 @@ llvm::DenseSet ModuleDepCollector::collectModuleMapFiles( ModuleDeps *MD = ModuleDepsByID.lookup(MID); assert(MD && "Inconsistent dependency info"); // TODO: Track ClangModuleMapFile as `FileEntryRef`. - auto FE = ScanInstance.getFileManager().getFile(MD->ClangModuleMapFile); + auto FE = ScanInstance.getFileManager().getOptionalFileRef( + MD->ClangModuleMapFile); assert(FE && "Missing module map file that was previously found"); ModuleMapFiles.insert(*FE); } diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 9fd5eae67a21f..18ccee382d44e 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -966,7 +966,8 @@ namespace shufflevector { namespace FunctionStart { void a(void) {} static_assert(__builtin_function_start(a) == a, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison of addresses of literals has unspecified value}} + // ref-note {{comparison against opaque constant address '&__builtin_function_start(a)'}} \ + // expected-note {{comparison of addresses of literals has unspecified value}} } namespace BuiltinInImplicitCtor { diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 68e212ff8933f..dea4055c531d2 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -99,7 +99,7 @@ constexpr int f() { static_assert(f()); #endif -/// Distinct literals have disctinct addresses. +/// Distinct literals have distinct addresses. 
/// see https://github.com/llvm/llvm-project/issues/58754 constexpr auto foo(const char *p) { return p; } constexpr auto p1 = "test1"; @@ -108,22 +108,16 @@ constexpr auto p2 = "test2"; constexpr bool b1 = foo(p1) == foo(p1); static_assert(b1); -constexpr bool b2 = foo(p1) == foo(p2); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{comparison of addresses of literals}} \ - // ref-note {{declared here}} -static_assert(!b2); // ref-error {{not an integral constant expression}} \ - // ref-note {{not a constant expression}} +constexpr bool b2 = foo(p1) == foo(p2); +static_assert(!b2); constexpr auto name1() { return "name1"; } constexpr auto name2() { return "name2"; } -constexpr auto b3 = name1() == name1(); -static_assert(b3); -constexpr auto b4 = name1() == name2(); // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{has unspecified value}} \ - // ref-note {{declared here}} -static_assert(!b4); // ref-error {{not an integral constant expression}} \ - // ref-note {{not a constant expression}} +constexpr auto b3 = name1() == name1(); // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{comparison of addresses of literals}} +constexpr auto b4 = name1() == name2(); +static_assert(!b4); namespace UninitializedFields { class A { diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index 756eec5b82560..1803fb8ab2e9a 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -158,6 +158,21 @@ namespace VirtualBases { /// Calls the constructor of D. D d; } + +#if __cplusplus >= 202302L + struct VBase {}; + struct HasVBase : virtual VBase {}; // all23-note 1{{virtual base class declared here}} + struct Derived : HasVBase { + constexpr Derived() {} // all23-error {{constexpr constructor not allowed in struct with virtual base class}} + }; + template struct DerivedFromVBase : T { + constexpr DerivedFromVBase(); + }; + constexpr int f(DerivedFromVBase) {} + template constexpr DerivedFromVBase::DerivedFromVBase() : T() {} + constexpr int nVBase = (DerivedFromVBase(), 0); // all23-error {{constant expression}} \ + // all23-note {{cannot construct object of type 'DerivedFromVBase' with virtual base class in a constant expression}} +#endif } namespace LabelGoto { @@ -238,3 +253,54 @@ namespace TwosComplementShifts { static_assert(-3 >> 1 == -2); static_assert(-7 >> 1 == -4); } + +namespace AnonUnionDtor { + struct A { + A (); + ~A(); + }; + + template + struct opt + { + union { // all20-note {{is not literal}} + char c; + T data; + }; + + constexpr opt() {} + + constexpr ~opt() { + if (engaged) + data.~T(); + } + + bool engaged = false; + }; + + consteval void foo() { + opt a; // all20-error {{variable of non-literal type}} + } + + void bar() { foo(); } +} + +/// FIXME: The two interpreters disagree about where to diagnose the non-constexpr destructor call.
+namespace NonLiteralDtorInParam { + class NonLiteral { // all20-note {{is not an aggregate and has no constexpr constructors other than copy or move constructors}} + public: + NonLiteral() {} + ~NonLiteral() {} // all23-note {{declared here}} + }; + constexpr int F2(NonLiteral N) { // all20-error {{constexpr function's 1st parameter type 'NonLiteral' is not a literal type}} \ + // ref23-note {{non-constexpr function '~NonLiteral' cannot be used in a constant expression}} + return 8; + } + + + void test() { + NonLiteral L; + constexpr auto D = F2(L); // all23-error {{must be initialized by a constant expression}} \ + // expected23-note {{non-constexpr function '~NonLiteral' cannot be used in a constant expression}} + } +} diff --git a/clang/test/AST/ByteCode/fixed-point.cpp b/clang/test/AST/ByteCode/fixed-point.cpp new file mode 100644 index 0000000000000..4bf80ba7c58f0 --- /dev/null +++ b/clang/test/AST/ByteCode/fixed-point.cpp @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 %s -fsyntax-only -ffixed-point -verify=expected,both -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -fsyntax-only -ffixed-point -verify=ref,both + +static_assert((bool)1.0k); +static_assert(!((bool)0.0k)); +static_assert((bool)0.0k); // both-error {{static assertion failed}} + +static_assert(1.0k == 1.0k); +static_assert(1.0k == 1); +static_assert(1.0k != 1.0k); // both-error {{failed due to requirement '1.0k != 1.0k'}} +static_assert(1.0k != 1); // both-error {{failed due to requirement '1.0k != 1'}} +static_assert(-12.0k == -(-(-12.0k))); + +constexpr _Accum acc = (0.5r, 6.9k); + +/// Zero-init. +constexpr _Accum A{}; +static_assert(A == 0.0k); +static_assert(A == 0); + +namespace IntToFixedPointCast { + constexpr _Accum B = 13; + static_assert(B == 13.0k); + static_assert(B == 13); + + constexpr _Fract sf = -1; + static_assert(sf == -1.0k); + static_assert(sf == -1); +} + +namespace FixedPointToIntCasts { + constexpr _Accum A = -13.0k; + constexpr int I = A; + static_assert(I == -13); +} + +namespace FloatToFixedPointCast { + constexpr _Fract sf = 1.0; // both-error {{must be initialized by a constant expression}} \ + // both-note {{outside the range of representable values of type 'const _Fract'}} + + constexpr _Fract sf2 = 0.5; + static_assert(sf2 == 0.5); + constexpr float sf2f = sf2; + static_assert(sf2f == 0.5); +} + +namespace BinOps { + constexpr _Accum A = 13; + static_assert(A + 1 == 14.0k); + static_assert(1 + A == 14.0k); + static_assert((A + A) == 26); + + static_assert(A + 100000 == 14.0k); // both-error {{is not an integral constant expression}} \ + // both-note {{is outside the range of representable values}} + + static_assert((A - A) == 0); + constexpr short _Accum mul_ovf1 = 255.0hk * 4.5hk; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 123.5 is outside the range of representable values of type 'short _Accum'}} + constexpr short _Accum div_ovf1 = 255.0hk / 0.5hk; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value -2.0 is outside the range of representable values of type 'short _Accum'}} + +} + +namespace FixedPointCasts { + constexpr _Fract B = 0.3; + constexpr _Accum A = B; + constexpr _Fract C = A; +} + +namespace Cmp { + constexpr _Accum A = 13.0k; + constexpr _Accum B = 14.0k; + static_assert(B > A); + static_assert(B >= A); + static_assert(A < B); + static_assert(A <= B); +} diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 1ff6ff3ac1922..caf3ac97fd1c0 
100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -271,3 +271,18 @@ namespace ConstructAt { // both-note {{in call}} } + +namespace UsedToCrash { + struct S { + int* i; + constexpr S() : i(new int(42)) {} // #no-deallocation + constexpr ~S() {delete i;} + }; + consteval void alloc() { + S* s = new S(); + s->~S(); + new (s) S(); + delete s; + } + int alloc1 = (alloc(), 0); +} diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 7e3cf5b94518f..215f26bd5da8e 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1653,3 +1653,11 @@ namespace ExprWithCleanups { constexpr auto F = true ? 1i : 2i; static_assert(F == 1i, ""); } + +namespace NullptrUpcast { + struct A {}; + struct B : A { int n; }; + constexpr B *nb = nullptr; + constexpr A &ra = *nb; // both-error {{constant expression}} \ + // both-note {{cannot access base class of null pointer}} +} diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index c3ba520e0f68e..55c0dfa2eaa53 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -29,36 +29,26 @@ RWBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class RWBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: implicit h 'element_type * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME:':'element_type *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const RWBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'RWBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 
'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class RWBuffer definition @@ -66,8 +56,8 @@ RWBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'float *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index 1a3deba5830fa..b31db8ce59f22 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -30,39 +30,27 @@ StructuredBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type * +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' +// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this -// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline // CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' 
lvalue implicit this
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
 // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline
 // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition
@@ -70,9 +58,9 @@ StructuredBuffer Buffer;
 // CHECK: TemplateArgument type 'float'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float *
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
-// CHECK-SAME: ':'float *'
+// CHECK-SAME: ':'__hlsl_resource_t'
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer
diff --git a/clang/test/AST/ast-print-openacc-loop-construct.cpp b/clang/test/AST/ast-print-openacc-loop-construct.cpp
index cde302a66f3af..ae1f7964f019e 100644
--- a/clang/test/AST/ast-print-openacc-loop-construct.cpp
+++ b/clang/test/AST/ast-print-openacc-loop-construct.cpp
@@ -57,4 +57,29 @@ void foo() {
 // CHECK-NEXT: ;
 #pragma acc loop private(i, array[1], array, array[1:2])
   for(;;);
+
+// CHECK: #pragma acc loop collapse(1)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: ;
+#pragma acc loop collapse(1)
+  for(;;);
+// CHECK: #pragma acc loop collapse(force:1)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: ;
+#pragma acc loop collapse(force:1)
+  for(;;);
+// CHECK: #pragma acc loop collapse(2)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: ;
+#pragma acc loop collapse(2)
+  for(;;)
+    for(;;);
+// CHECK: #pragma acc loop collapse(force:2)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: for (;;)
+// CHECK-NEXT: ;
+#pragma acc loop collapse(force:2)
+  for(;;)
+    for(;;);
 }
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index c427b22fd683e..933b4c5e62a79 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -108,4 +108,52 @@ struct RefCountable {
 
 template <typename T> T *downcast(T *t) { return t; }
 
+template <typename T> struct CheckedRef {
+private:
+  T *t;
+
+public:
+  CheckedRef() : t{} {};
+  CheckedRef(T &t) : t(&t) { t.incrementPtrCount(); }
+  CheckedRef(const CheckedRef& o) : t(o.t) { if (t) t->incrementPtrCount(); }
+  ~CheckedRef() { if (t) t->decrementPtrCount(); }
+  T &get() { return *t; }
+  T *ptr() { return t; }
+  T *operator->() { return t; }
+  operator const T &() const { return *t; }
+  operator T &() { return *t; }
+};
+
+template <typename T> struct CheckedPtr {
+private:
+  T *t;
+
+public:
+  CheckedPtr() : t(nullptr) {}
+  CheckedPtr(T *t)
+    : t(t) {
+    if (t)
+      t->incrementPtrCount();
+  }
+  CheckedPtr(Ref<T>&& o)
+    : t(o.leakRef())
+  { }
+  ~CheckedPtr() {
+    if (t)
+      t->decrementPtrCount();
+  }
+  T *get() { return t; }
+  T *operator->() { return t; }
+  const T *operator->() const { return t; }
+  T &operator*() { return *t; }
+  CheckedPtr &operator=(T *) { return *this; }
+  operator bool() const { return t; }
+};
+
+class CheckedObj {
+public:
+  void incrementPtrCount();
+  void decrementPtrCount();
+};
+
 #endif
diff --git a/clang/test/Analysis/Checkers/WebKit/unchecked-members.cpp b/clang/test/Analysis/Checkers/WebKit/unchecked-members.cpp
new file mode 100644
index 0000000000000..0189b0cd50fcc
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unchecked-members.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUncheckedPtrMemberChecker -verify %s
+
+#include "mock-types.h"
+
+namespace members {
+
+  struct Foo {
+  private:
+    CheckedObj* a = nullptr;
+// expected-warning@-1{{Member variable 'a' in 'members::Foo' is a raw pointer to CheckedPtr capable type 'CheckedObj'}}
+    CheckedObj& b;
+// expected-warning@-1{{Member variable 'b' in 'members::Foo' is a reference to CheckedPtr capable type 'CheckedObj'}}
+
+    [[clang::suppress]]
+    CheckedObj* a_suppressed = nullptr;
+
+    [[clang::suppress]]
+    CheckedObj& b_suppressed;
+
+    CheckedPtr<CheckedObj> c;
+    CheckedRef<CheckedObj> d;
+
+  public:
+    Foo();
+  };
+
+  template <typename S>
+  struct FooTmpl {
+    S* e;
+// expected-warning@-1{{Member variable 'e' in 'members::FooTmpl' is a raw pointer to CheckedPtr capable type 'CheckedObj'}}
+  };
+
+  void forceTmplToInstantiate(FooTmpl<CheckedObj>) { }
+
+} // namespace members
+
+namespace ignore_unions {
+
+  union Foo {
+    CheckedObj* a;
+    CheckedPtr<CheckedObj> c;
+    CheckedRef<CheckedObj> d;
+  };
+
+  template <typename T>
+  union FooTmpl {
+    T* a;
+  };
+
+  void forceTmplToInstantiate(FooTmpl<CheckedObj>) { }
+
+} // namespace ignore_unions
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-const-v-muable.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-const-v-muable.cpp
new file mode 100644
index 0000000000000..2721cd8474e1b
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-const-v-muable.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+
+#include "mock-types.h"
+
+class Object {
+public:
+  void ref() const;
+  void deref() const;
+
+  bool constFunc() const;
+  void mutableFunc();
+};
+
+class Caller {
+  void someFunction();
+  void otherFunction();
+private:
+  RefPtr<Object> m_obj;
+};
+
+void Caller::someFunction()
+{
+  m_obj->constFunc();
+  // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+  m_obj->mutableFunc();
+  // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+}
diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h
index 8fd51449ecc0a..fadc09f65d536 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator.h
@@ -154,3 +154,11 @@ void _Exit(int status) __attribute__ ((__noreturn__));
 
 #define EOF (-1)
 #define offsetof(t, d) __builtin_offsetof(t, d)
+
+struct sigaction {
+  void (*sa_handler)(int);
+};
+#define SIGINT 2
+#define SIG_IGN (void (*)(int))1
+
+int sigaction(int, const struct sigaction *restrict, struct sigaction *restrict);
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 8eb869bac46f8..47594e8317bc7 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -9,7 +9,6 @@
 // CHECK-NEXT: alpha.clone.CloneChecker:ReportNormalClones = true
 // CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false
 // CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false
-// CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
 // CHECK-NEXT: apply-fixits = false
 // CHECK-NEXT: assume-controlled-environment = false
 // CHECK-NEXT: avoid-suppressing-null-argument-paths = false
@@ -111,6 +110,7 @@
 // CHECK-NEXT: optin.cplusplus.VirtualCall:ShowFixIts = false
 // CHECK-NEXT: optin.osx.cocoa.localizability.NonLocalizedStringChecker:AggressiveReport
= false // CHECK-NEXT: optin.performance.Padding:AllowedPad = 24 +// CHECK-NEXT: optin.taint.TaintPropagation:Config = "" // CHECK-NEXT: osx.NumberObjectConversion:Pedantic = false // CHECK-NEXT: osx.cocoa.RetainCount:TrackNSCFStartParam = false // CHECK-NEXT: prune-paths = true diff --git a/clang/test/Analysis/assume-controlled-environment.c b/clang/test/Analysis/assume-controlled-environment.c index fce1a1e7bae33..4f663502f8657 100644 --- a/clang/test/Analysis/assume-controlled-environment.c +++ b/clang/test/Analysis/assume-controlled-environment.c @@ -1,12 +1,12 @@ // RUN: %clang_analyze_cc1 -verify=untrusted-env %s \ // RUN: -analyzer-checker=core \ -// RUN: -analyzer-checker=alpha.security.taint \ +// RUN: -analyzer-checker=optin.taint \ // RUN: -analyzer-checker=debug.TaintTest // RUN: %clang_analyze_cc1 -verify %s -DEXPECT_NO_WARNINGS \ // RUN: -analyzer-config assume-controlled-environment=true \ // RUN: -analyzer-checker=core \ -// RUN: -analyzer-checker=alpha.security.taint \ +// RUN: -analyzer-checker=optin.taint \ // RUN: -analyzer-checker=debug.TaintTest diff --git a/clang/test/Analysis/bool-assignment.c b/clang/test/Analysis/bool-assignment.c index c32bc8f9e8b14..3a104cf627ffa 100644 --- a/clang/test/Analysis/bool-assignment.c +++ b/clang/test/Analysis/bool-assignment.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.BoolAssignment,alpha.security.taint -verify -std=c99 -Dbool=_Bool %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.BoolAssignment,alpha.security.taint -verify -x c++ %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.BoolAssignment,optin.taint -verify -std=c99 -Dbool=_Bool %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.BoolAssignment,optin.taint -verify -x c++ %s // Test C++'s bool and C's _Bool. // FIXME: We stopped warning on these when SValBuilder got smarter about diff --git a/clang/test/Analysis/casts.c b/clang/test/Analysis/casts.c index 462a9865f1564..30cd74be564fd 100644 --- a/clang/test/Analysis/casts.c +++ b/clang/test/Analysis/casts.c @@ -129,7 +129,7 @@ void locAsIntegerCasts(void *p) { } void multiDimensionalArrayPointerCasts(void) { - static int x[10][10]; // expected-note2{{Array at the right-hand side of subtraction}} + static int x[10][10]; int *y1 = &(x[3][5]); char *z = ((char *) y1) + 2; int *y2 = (int *)(z - 2); @@ -138,9 +138,7 @@ void multiDimensionalArrayPointerCasts(void) { clang_analyzer_eval(y1 == y2); // expected-warning{{TRUE}} // FIXME: should be FALSE (i.e. equal pointers). - // FIXME: pointer subtraction warning might be incorrect clang_analyzer_eval(y1 - y2); // expected-warning{{UNKNOWN}} - // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}} // FIXME: should be TRUE (i.e. same symbol). clang_analyzer_eval(*y1 == *y2); // expected-warning{{UNKNOWN}} @@ -149,9 +147,7 @@ void multiDimensionalArrayPointerCasts(void) { clang_analyzer_eval(y1 == y3); // expected-warning{{TRUE}} // FIXME: should be FALSE (i.e. equal pointers). - // FIXME: pointer subtraction warning might be incorrect clang_analyzer_eval(y1 - y3); // expected-warning{{UNKNOWN}} - // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}} // FIXME: should be TRUE (i.e. same symbol). 
clang_analyzer_eval(*y1 == *y3); // expected-warning{{UNKNOWN}} diff --git a/clang/test/Analysis/ctu-on-demand-parsing.c b/clang/test/Analysis/ctu-on-demand-parsing.c index 72288def61b13..17ade150ded5e 100644 --- a/clang/test/Analysis/ctu-on-demand-parsing.c +++ b/clang/test/Analysis/ctu-on-demand-parsing.c @@ -24,6 +24,7 @@ // // FIXME: Path handling should work on all platforms. // REQUIRES: system-linux +// UNSUPPORTED: target={{.*}}-zos{{.*}} void clang_analyzer_eval(int); diff --git a/clang/test/Analysis/ctu-on-demand-parsing.cpp b/clang/test/Analysis/ctu-on-demand-parsing.cpp index d28d3c22c69b0..0c0128faefaea 100644 --- a/clang/test/Analysis/ctu-on-demand-parsing.cpp +++ b/clang/test/Analysis/ctu-on-demand-parsing.cpp @@ -35,6 +35,7 @@ // // FIXME: Path handling should work on all platforms. // REQUIRES: system-linux +// UNSUPPORTED: target={{.*}}-zos{{.*}} #include "ctu-hdr.h" diff --git a/clang/test/Analysis/cxx-method-names.cpp b/clang/test/Analysis/cxx-method-names.cpp index 22ec4db34796b..5254d82bd90b2 100644 --- a/clang/test/Analysis/cxx-method-names.cpp +++ b/clang/test/Analysis/cxx-method-names.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,osx,alpha.unix,alpha.security.taint -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,osx,alpha.unix,optin.taint -verify %s // expected-no-diagnostics class Evil { diff --git a/clang/test/Analysis/debug-exprinspection-istainted.c b/clang/test/Analysis/debug-exprinspection-istainted.c index 8d1ebca930885..b459f3a3e791b 100644 --- a/clang/test/Analysis/debug-exprinspection-istainted.c +++ b/clang/test/Analysis/debug-exprinspection-istainted.c @@ -1,7 +1,7 @@ // RUN: %clang_analyze_cc1 -verify %s \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=debug.ExprInspection \ -// RUN: -analyzer-checker=alpha.security.taint +// RUN: -analyzer-checker=optin.taint int scanf(const char *restrict format, ...); void clang_analyzer_isTainted(char); diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif index 0ac96cacbed19..0bded6f0925d1 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif @@ -4,7 +4,7 @@ { "artifacts": [ { - "length": 434, + "length": 425, "location": { "index": 0, }, diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif index ae49ad6604cb2..7f9deea304832 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif @@ -4,7 +4,7 @@ { "artifacts": [ { - "length": 1081, + "length": 1071, "location": { "index": 0, }, diff --git a/clang/test/Analysis/diagnostics/sarif-diagnostics-taint-test.c b/clang/test/Analysis/diagnostics/sarif-diagnostics-taint-test.c index b1042f9034d7d..7a3ca61c4319f 100644 --- a/clang/test/Analysis/diagnostics/sarif-diagnostics-taint-test.c +++ b/clang/test/Analysis/diagnostics/sarif-diagnostics-taint-test.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.taint,debug.TaintTest %s -verify -analyzer-output=sarif -o - | %normalize_sarif | diff -U1 -b 
%S/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif -
+// RUN: %clang_analyze_cc1 -analyzer-checker=optin.taint,debug.TaintTest %s -verify -analyzer-output=sarif -o - | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-diagnostics-taint-test.c.sarif -
 
 #include "../Inputs/system-header-simulator.h"
 int atoi(const char *nptr);
diff --git a/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c b/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c
index 61d19817407e2..eeafd178628b3 100644
--- a/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c
+++ b/clang/test/Analysis/diagnostics/sarif-multi-diagnostic-test.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.security.taint,debug.TaintTest,unix.Malloc %s -verify -analyzer-output=sarif -o - | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif -
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.taint,debug.TaintTest,unix.Malloc %s -verify -analyzer-output=sarif -o - | %normalize_sarif | diff -U1 -b %S/Inputs/expected-sarif/sarif-multi-diagnostic-test.c.sarif -
 #include "../Inputs/system-header-simulator.h"
 #include "../Inputs/system-header-simulator-for-malloc.h"
 #define ERR -1
@@ -43,4 +43,3 @@ int main(void) {
   unicode();
   return 0;
 }
-
diff --git a/clang/test/Analysis/divzero-tainted-div-difference.c b/clang/test/Analysis/divzero-tainted-div-difference.c
new file mode 100644
index 0000000000000..28486ccdf7e4f
--- /dev/null
+++ b/clang/test/Analysis/divzero-tainted-div-difference.c
@@ -0,0 +1,34 @@
+// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \
+// RUN:   -Wno-incompatible-library-redeclaration -verify=normaldiv %s \
+// RUN:   -analyzer-checker=optin.taint.GenericTaint \
+// RUN:   -analyzer-checker=core
+
+// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \
+// RUN:   -Wno-incompatible-library-redeclaration -verify=tainteddiv %s \
+// RUN:   -analyzer-checker=optin.taint.GenericTaint \
+// RUN:   -analyzer-checker=optin.taint.TaintedDiv
+
+int getchar(void);
+
+
+// If we are sure that we divide by zero,
+// we emit a division by zero warning.
+int testDivZero(void) {
+  int x = getchar(); // taint source
+  if (!x)
+    return 5 / x; // normaldiv-warning{{Division by zero}}
+  return 8;
+}
+
+// The attacker-provided value might be 0.
+int testDivZero2(void) {
+  int x = getchar(); // taint source
+  return 5 / x; // tainteddiv-warning{{Division by a tainted value}}
+}
+
+int testDivZero3(void) {
+  int x = getchar(); // taint source
+  if (!x)
+    return 0;
+  return 5 / x; // no warning
+}
diff --git a/clang/test/Analysis/fread.c b/clang/test/Analysis/fread.c
index 5dc6c0c744093..8dc998ea1e899 100644
--- a/clang/test/Analysis/fread.c
+++ b/clang/test/Analysis/fread.c
@@ -1,6 +1,6 @@
 // RUN: %clang_analyze_cc1 -verify %s \
 // RUN:   -triple x86_64-linux-gnu \
-// RUN:   -analyzer-checker=core,unix.Stream,alpha.security.taint \
+// RUN:   -analyzer-checker=core,unix.Stream,optin.taint \
 // RUN:   -analyzer-checker=debug.ExprInspection
 
 #include "Inputs/system-header-simulator-for-simple-stream.h"
diff --git a/clang/test/Analysis/global-region-invalidation-errno.c b/clang/test/Analysis/global-region-invalidation-errno.c
index 9de10ad59095a..868869b5d262f 100644
--- a/clang/test/Analysis/global-region-invalidation-errno.c
+++ b/clang/test/Analysis/global-region-invalidation-errno.c
@@ -1,9 +1,9 @@
 // RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -disable-free -verify %s \
-// RUN:
-analyzer-checker=core,deadcode,alpha.security.taint \ +// RUN: -analyzer-checker=core,deadcode,optin.taint \ // RUN: -DERRNO_VAR // RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -disable-free -verify %s \ -// RUN: -analyzer-checker=core,deadcode,alpha.security.taint \ +// RUN: -analyzer-checker=core,deadcode,optin.taint \ // RUN: -DERRNO_FUNC // Note, we do need to include headers here, since the analyzer checks if the function declaration is located in a system header. diff --git a/clang/test/Analysis/global-region-invalidation.c b/clang/test/Analysis/global-region-invalidation.c index faca3baf11caf..2f78467630ce6 100644 --- a/clang/test/Analysis/global-region-invalidation.c +++ b/clang/test/Analysis/global-region-invalidation.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -disable-free -verify %s \ -// RUN: -analyzer-checker=core,deadcode,alpha.security.taint,debug.TaintTest,debug.ExprInspection +// RUN: -analyzer-checker=core,deadcode,optin.taint,debug.TaintTest,debug.ExprInspection void clang_analyzer_eval(int); diff --git a/clang/test/Analysis/malloc-annotations.c b/clang/test/Analysis/malloc-annotations.c index c2fdf8a5641ae..c601a0383d221 100644 --- a/clang/test/Analysis/malloc-annotations.c +++ b/clang/test/Analysis/malloc-annotations.c @@ -3,6 +3,7 @@ // RUN: -analyzer-checker=alpha.deadcode.UnreachableCode \ // RUN: -analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix.Malloc \ +// RUN: -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-config unix.DynamicMemoryModeling:Optimistic=true %s typedef __typeof(sizeof(int)) size_t; @@ -23,6 +24,12 @@ void __attribute((ownership_holds(malloc, 1))) my_hold(void *); void __attribute((ownership_holds(malloc, 1))) __attribute((ownership_holds(malloc, 1))) __attribute((ownership_holds(malloc, 3))) my_hold2(void *, void *, void *); + +__attribute((ownership_returns(user_malloc, 1))) void *user_malloc(size_t); +__attribute((ownership_takes(user_malloc, 1))) void user_free(void *); + +void clang_analyzer_dump(int); + void *my_malloc3(size_t); void *myglobalpointer; struct stuff { @@ -273,3 +280,10 @@ void testMultipleFreeAnnotations(void) { my_freeBoth(p, q); } +void testNoUninitAttr(void) { + int *p = user_malloc(sizeof(int)); + int read = p[0]; // no-warning + clang_analyzer_dump(p[0]); // expected-warning{{Unknown}} + user_free(p); +} + diff --git a/clang/test/Analysis/malloc.c b/clang/test/Analysis/malloc.c index 9c7ca43bfbc5a..57f8561a74da6 100644 --- a/clang/test/Analysis/malloc.c +++ b/clang/test/Analysis/malloc.c @@ -4,7 +4,7 @@ // RUN: -analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix \ // RUN: -analyzer-checker=debug.ExprInspection \ -// RUN: -analyzer-checker=alpha.security.taint.TaintPropagation \ +// RUN: -analyzer-checker=optin.taint.TaintPropagation \ // RUN: -analyzer-checker=optin.taint.TaintedAlloc #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/malloc.cpp b/clang/test/Analysis/malloc.cpp index 7af1b59e04a5a..2bbfaf6640b79 100644 --- a/clang/test/Analysis/malloc.cpp +++ b/clang/test/Analysis/malloc.cpp @@ -4,7 +4,7 @@ // RUN: -analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-checker=cplusplus.NewDelete \ -// RUN: -analyzer-checker=alpha.security.taint.TaintPropagation \ +// RUN: -analyzer-checker=optin.taint.TaintPropagation \ // RUN: -analyzer-checker=optin.taint.TaintedAlloc // RUN: %clang_analyze_cc1 -w -verify %s \ @@ -14,7 +14,7 @@ // RUN: 
-analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-checker=cplusplus.NewDelete \ -// RUN: -analyzer-checker=alpha.security.taint.TaintPropagation \ +// RUN: -analyzer-checker=optin.taint.TaintPropagation \ // RUN: -analyzer-checker=optin.taint.TaintedAlloc // RUN: %clang_analyze_cc1 -w -verify %s -DTEST_INLINABLE_ALLOCATORS \ @@ -23,7 +23,7 @@ // RUN: -analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-checker=cplusplus.NewDelete \ -// RUN: -analyzer-checker=alpha.security.taint.TaintPropagation \ +// RUN: -analyzer-checker=optin.taint.TaintPropagation \ // RUN: -analyzer-checker=optin.taint.TaintedAlloc // RUN: %clang_analyze_cc1 -w -verify %s -DTEST_INLINABLE_ALLOCATORS \ @@ -33,7 +33,7 @@ // RUN: -analyzer-checker=alpha.core.CastSize \ // RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-checker=cplusplus.NewDelete \ -// RUN: -analyzer-checker=alpha.security.taint.TaintPropagation \ +// RUN: -analyzer-checker=optin.taint.TaintPropagation \ // RUN: -analyzer-checker=optin.taint.TaintedAlloc #include "Inputs/system-header-simulator-cxx.h" diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c index de70e483add1c..8ecad7036c331 100644 --- a/clang/test/Analysis/out-of-bounds-diagnostics.c +++ b/clang/test/Analysis/out-of-bounds-diagnostics.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text \ -// RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s +// RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,optin.taint -verify %s int TenElements[10]; diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c index c29b6f8ab111b..391089b6a35d8 100644 --- a/clang/test/Analysis/out-of-bounds-notes.c +++ b/clang/test/Analysis/out-of-bounds-notes.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text \ -// RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s +// RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,optin.taint -verify %s int TenElements[10]; diff --git a/clang/test/Analysis/pointer-sub-notes.c b/clang/test/Analysis/pointer-sub-notes.c index 59681b4e7555a..7f94d6544d0f8 100644 --- a/clang/test/Analysis/pointer-sub-notes.c +++ b/clang/test/Analysis/pointer-sub-notes.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.PointerSub -analyzer-output=text -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text -verify %s void different_1() { int a[3]; // expected-note{{Array at the left-hand side of subtraction}} diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index cf9eac1abc2dc..1c9d676ebb8f2 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.PointerSub -analyzer-output=text-minimal -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text-minimal -verify %s void f1(void) { int x, y, z[10]; diff --git a/clang/test/Analysis/ptr-arith.c b/clang/test/Analysis/ptr-arith.c index f99dfabb07366..020a500629230 100644 --- a/clang/test/Analysis/ptr-arith.c +++ b/clang/test/Analysis/ptr-arith.c @@ -1,6 +1,8 @@ // RUN: %clang_analyze_cc1 
-analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple x86_64-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple i686-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s +#include "Inputs/system-header-simulator.h" + void clang_analyzer_eval(int); void clang_analyzer_dump(int); @@ -35,9 +37,20 @@ domain_port (const char *domain_b, const char *domain_e, return port; } +#define FIXED_VALUE (int*) 0x1111 + void f4(void) { int *p; p = (int*) 0x10000; // expected-warning{{Using a fixed address is not portable because that address will probably not be valid in all environments or platforms}} + long x = 0x10100; + x += 10; + p = (int*) x; // expected-warning{{Using a fixed address is not portable because that address will probably not be valid in all environments or platforms}} + + struct sigaction sa; + sa.sa_handler = SIG_IGN; // no warning (exclude macros defined in system header) + sigaction(SIGINT, &sa, NULL); + + p = FIXED_VALUE; // expected-warning{{Using a fixed address is not portable because that address will probably not be valid in all environments or platforms}} } void f5(void) { diff --git a/clang/test/Analysis/redefined_system.c b/clang/test/Analysis/redefined_system.c index 0a55c36c6dd5b..0998fb92d2cd2 100644 --- a/clang/test/Analysis/redefined_system.c +++ b/clang/test/Analysis/redefined_system.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=osx,unix,core,alpha.security.taint -w -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=osx,unix,core,optin.taint -w -verify %s // expected-no-diagnostics // Make sure we don't crash when someone redefines a system function we reason about. 
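The ptr-arith.c hunk above captures the updated alpha.core.FixedAddr behavior: a pointer formed from an integer literal or from a computed integer value is still reported, while an address coming from a macro defined in a system header (such as SIG_IGN) is now exempt. Below is a minimal standalone C sketch of both sides of that rule, assuming the same checker and simulator header the tests use; USER_ADDR is a hypothetical user-defined macro, not part of the patch:

  // Sketch only: run with -analyzer-checker=alpha.core.FixedAddr.
  // USER_ADDR is a hypothetical user macro; SIG_IGN/SIGINT/sigaction come
  // from the simulated system header added by this patch.
  #include "Inputs/system-header-simulator.h"

  #define USER_ADDR (int *)0x2222 /* user macro: still reported */

  void fixed_addr_sketch(void) {
    int *p = (int *)0x10000;  /* reported: literal fixed address */
    long raw = 0x10100;
    raw += 8;
    p = (int *)raw;           /* reported: fixed address computed from a literal */
    p = USER_ADDR;            /* reported: macro defined in user code */

    struct sigaction sa;
    sa.sa_handler = SIG_IGN;  /* not reported: macro from a (simulated) system header */
    sigaction(SIGINT, &sa, 0);
    (void)p;
  }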
diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c index 85232624160c0..79b4877eedbd9 100644 --- a/clang/test/Analysis/string.c +++ b/clang/test/Analysis/string.c @@ -25,7 +25,7 @@ // RUN: %clang_analyze_cc1 -verify %s -Wno-null-dereference \ // RUN: -DUSE_BUILTINS -DVARIANT \ // RUN: -analyzer-checker=core \ -// RUN: -analyzer-checker=alpha.security.taint \ +// RUN: -analyzer-checker=optin.taint \ // RUN: -analyzer-checker=unix.cstring \ // RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-checker=alpha.unix.cstring \ diff --git a/clang/test/Analysis/taint-checker-callback-order-has-definition.c b/clang/test/Analysis/taint-checker-callback-order-has-definition.c index eaf96cc675f06..66c244c4cfda3 100644 --- a/clang/test/Analysis/taint-checker-callback-order-has-definition.c +++ b/clang/test/Analysis/taint-checker-callback-order-has-definition.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 %s \ -// RUN: -analyzer-checker=core,alpha.security.taint \ +// RUN: -analyzer-checker=core,optin.taint \ // RUN: -mllvm -debug-only=taint-checker \ // RUN: 2>&1 | FileCheck %s diff --git a/clang/test/Analysis/taint-checker-callback-order-without-definition.c b/clang/test/Analysis/taint-checker-callback-order-without-definition.c index 6de87f736926d..5f8df871b304d 100644 --- a/clang/test/Analysis/taint-checker-callback-order-without-definition.c +++ b/clang/test/Analysis/taint-checker-callback-order-without-definition.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 %s \ -// RUN: -analyzer-checker=core,alpha.security.taint \ +// RUN: -analyzer-checker=core,optin.taint \ // RUN: -mllvm -debug-only=taint-checker \ // RUN: 2>&1 | FileCheck %s diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c index f51423646e8ae..223df9951fd6b 100644 --- a/clang/test/Analysis/taint-diagnostic-visitor.c +++ b/clang/test/Analysis/taint-diagnostic-visitor.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -analyze -analyzer-checker=alpha.security.taint,core,alpha.security.ArrayBoundV2,optin.taint.TaintedAlloc -analyzer-output=text -verify %s +// RUN: %clang_cc1 -analyze -analyzer-checker=optin.taint,core,alpha.security.ArrayBoundV2 -analyzer-output=text -verify %s // This file is for testing enhanced diagnostics produced by the GenericTaintChecker diff --git a/clang/test/Analysis/taint-dumps.c b/clang/test/Analysis/taint-dumps.c index 37fb6c2f2adf7..01bf0d7deff3a 100644 --- a/clang/test/Analysis/taint-dumps.c +++ b/clang/test/Analysis/taint-dumps.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.taint\ +// RUN: %clang_analyze_cc1 -analyzer-checker=optin.taint\ // RUN: -analyzer-checker=debug.ExprInspection %s\ // RUN: 2>&1 | FileCheck %s diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index 1c139312734bc..ad5a99fe8b3a3 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -1,57 +1,53 @@ // RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \ // RUN: -Wno-incompatible-library-redeclaration -verify %s \ -// RUN: -analyzer-checker=alpha.security.taint \ +// RUN: -analyzer-checker=optin.taint.GenericTaint \ +// RUN: -analyzer-checker=optin.taint.TaintedDiv \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=alpha.security.ArrayBoundV2 \ // RUN: -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-config \ -// RUN: alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml +// RUN: 
optin.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml // RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \ // RUN: -Wno-incompatible-library-redeclaration -verify %s \ // RUN: -DFILE_IS_STRUCT \ -// RUN: -analyzer-checker=alpha.security.taint \ +// RUN: -analyzer-checker=optin.taint.GenericTaint \ +// RUN: -analyzer-checker=optin.taint.TaintedDiv \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=alpha.security.ArrayBoundV2 \ // RUN: -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-config \ -// RUN: alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml +// RUN: optin.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml -// RUN: not %clang_analyze_cc1 -Wno-pointer-to-int-cast \ -// RUN: -Wno-incompatible-library-redeclaration -verify %s \ -// RUN: -analyzer-checker=alpha.security.taint \ -// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: not %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=optin.taint.GenericTaint \ // RUN: -analyzer-config \ -// RUN: alpha.security.taint.TaintPropagation:Config=justguessit \ +// RUN: optin.taint.TaintPropagation:Config=justguessit \ // RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-FILE // CHECK-INVALID-FILE: (frontend): invalid input for checker option -// CHECK-INVALID-FILE-SAME: 'alpha.security.taint.TaintPropagation:Config', +// CHECK-INVALID-FILE-SAME: 'optin.taint.TaintPropagation:Config', // CHECK-INVALID-FILE-SAME: that expects a valid filename instead of // CHECK-INVALID-FILE-SAME: 'justguessit' -// RUN: not %clang_analyze_cc1 -Wno-incompatible-library-redeclaration \ -// RUN: -verify %s \ -// RUN: -analyzer-checker=alpha.security.taint \ -// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: not %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=optin.taint.GenericTaint \ // RUN: -analyzer-config \ -// RUN: alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-ill-formed.yaml \ +// RUN: optin.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-ill-formed.yaml \ // RUN: 2>&1 | FileCheck -DMSG=%errc_EINVAL %s -check-prefix=CHECK-ILL-FORMED // CHECK-ILL-FORMED: (frontend): invalid input for checker option -// CHECK-ILL-FORMED-SAME: 'alpha.security.taint.TaintPropagation:Config', +// CHECK-ILL-FORMED-SAME: 'optin.taint.TaintPropagation:Config', // CHECK-ILL-FORMED-SAME: that expects a valid yaml file: [[MSG]] -// RUN: not %clang_analyze_cc1 -Wno-incompatible-library-redeclaration \ -// RUN: -verify %s \ -// RUN: -analyzer-checker=alpha.security.taint \ -// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: not %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=optin.taint.GenericTaint \ // RUN: -analyzer-config \ -// RUN: alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-invalid-arg.yaml \ +// RUN: optin.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-invalid-arg.yaml \ // RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-ARG // CHECK-INVALID-ARG: (frontend): invalid input for checker option -// CHECK-INVALID-ARG-SAME: 'alpha.security.taint.TaintPropagation:Config', +// CHECK-INVALID-ARG-SAME: 'optin.taint.TaintPropagation:Config', // CHECK-INVALID-ARG-SAME: that expects an argument number for propagation // CHECK-INVALID-ARG-SAME: rules greater or equal to -1 @@ -408,6 +404,14 @@ int testDivByZero(void) { return 5/x; // expected-warning {{Division by a tainted value, possibly zero}} } +int testTaintedDivFP(void) { + int x; + 
scanf("%d", &x); + if (!x) + return 0; + return 5/x; // x cannot be 0, so no tainted warning either +} + // Zero-sized VLAs. void testTaintedVLASize(void) { int x; diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp index 0aadef88c704c..8092ac6f270b2 100644 --- a/clang/test/Analysis/taint-generic.cpp +++ b/clang/test/Analysis/taint-generic.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.taint,core,alpha.security.ArrayBoundV2 -analyzer-config alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml -Wno-format-security -verify -std=c++11 %s +// RUN: %clang_analyze_cc1 -analyzer-checker=optin.taint,core,alpha.security.ArrayBoundV2 -analyzer-config optin.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml -Wno-format-security -verify -std=c++11 %s #define BUFSIZE 10 int Buffer[BUFSIZE]; diff --git a/clang/test/Analysis/taint-tester.c b/clang/test/Analysis/taint-tester.c index 302349fb662dd..479a96c92ecec 100644 --- a/clang/test/Analysis/taint-tester.c +++ b/clang/test/Analysis/taint-tester.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -Wno-int-to-pointer-cast -analyzer-checker=alpha.security.taint,debug.TaintTest %s -verify +// RUN: %clang_analyze_cc1 -Wno-int-to-pointer-cast -analyzer-checker=optin.taint,debug.TaintTest %s -verify #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/taint-tester.cpp b/clang/test/Analysis/taint-tester.cpp index 23a92cc56d248..a419938906800 100644 --- a/clang/test/Analysis/taint-tester.cpp +++ b/clang/test/Analysis/taint-tester.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.taint,debug.TaintTest %s -verify +// RUN: %clang_analyze_cc1 -analyzer-checker=optin.taint,debug.TaintTest %s -verify // expected-no-diagnostics typedef struct _FILE FILE; @@ -32,4 +32,3 @@ void testOpaqueClass(opaque *obj) { char buf[20]; snprintf(buf, 20, "%p", obj); // don't crash trying to load *obj } - diff --git a/clang/test/Analysis/taint-tester.m b/clang/test/Analysis/taint-tester.m index 531c21b5faf88..3358a7769e257 100644 --- a/clang/test/Analysis/taint-tester.m +++ b/clang/test/Analysis/taint-tester.m @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.taint,debug.TaintTest %s -verify +// RUN: %clang_analyze_cc1 -analyzer-checker=optin.taint,debug.TaintTest %s -verify // expected-no-diagnostics #import @@ -14,8 +14,8 @@ void TestLog (NSString *format, ...) { va_list ap; va_start(ap, format); NSString *string = @"AAA: "; - + NSLogv([string stringByAppendingString:format], ap); - + va_end(ap); } \ No newline at end of file diff --git a/clang/test/CXX/dcl/dcl.decl/p3.cpp b/clang/test/CXX/dcl/dcl.decl/p3.cpp index f141568ba6c22..b082e1c122a09 100644 --- a/clang/test/CXX/dcl/dcl.decl/p3.cpp +++ b/clang/test/CXX/dcl/dcl.decl/p3.cpp @@ -65,4 +65,4 @@ struct R { }; template -R(T) -> R requires true; // expected-error{{deduction guide cannot have a requires clause}} +R(T) -> R requires true; diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index 2b57dbc60aed7..581e2af822d55 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -201,3 +201,31 @@ static_assert(false, f().s); #endif } // namespace cwg2798 +namespace cwg2707 { // cwg2707: 20 + +#if __cplusplus >= 202002L + +template struct A { + T value[N]; +}; + +template +A(T...) 
-> A requires (sizeof...(T) == 2); + +// Brace elision is not allowed for synthesized CTAD guides if the array size +// is value-dependent. +// So this should pick up our explicit deduction guide. +A a = {1, 2}; + +A b = {3, 4, 5}; +// expected-error@-1 {{no viable constructor or deduction guide}} \ +// expected-note@-13 {{candidate function template not viable}} \ +// expected-note@-13 {{implicit deduction guide}} \ +// expected-note@-8 {{constraints not satisfied}} \ +// expected-note@-8 {{because 'sizeof...(T) == 2' (3 == 2) evaluated to false}} \ +// expected-note@-13 {{candidate function template not viable}} \ +// expected-note@-13 {{implicit deduction guide}} + +#endif + +} // namespace cwg2707 diff --git a/clang/test/CXX/drs/cwg29xx.cpp b/clang/test/CXX/drs/cwg29xx.cpp index e55e8e35e86f2..9629bdd41a2a5 100644 --- a/clang/test/CXX/drs/cwg29xx.cpp +++ b/clang/test/CXX/drs/cwg29xx.cpp @@ -6,6 +6,26 @@ // RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected %s // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected %s +namespace cwg2913 { // cwg2913: 20 tentatively ready 2024-08-16 + +#if __cplusplus >= 202002L + +template +struct R { + R(T); + R(T, T); +}; + +template +R(T) -> R requires true; + +template +R(T, T) requires true -> R; // expected-error {{expected function body after function declarator}} + +#endif + +} // namespace cwg2913 + namespace cwg2915 { // cwg2915: 20 tentatively ready 2024-08-16 #if __cplusplus >= 202302L struct A { diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/requires-expr.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/requires-expr.cpp index 90a38292d15d3..09f8f36d3c5ba 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/requires-expr.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/requires-expr.cpp @@ -65,4 +65,18 @@ template requires requires { T::value; S::s; } struct r4 { }; using r4i = r4; -// expected-error@-1 {{constraints not satisfied for class template 'r4' [with T = int]}} \ No newline at end of file +// expected-error@-1 {{constraints not satisfied for class template 'r4' [with T = int]}} + +namespace GH109538 { +static_assert(requires(void *t) { t; }); +static_assert(requires(void) { 42; }); +static_assert(requires(void t) { // expected-error {{argument may not have 'void' type}} + t; +}); +static_assert(requires(void t, int a) { // expected-error {{'void' must be the first and only parameter if specified}} + t; +}); +static_assert(requires(const void) { // expected-error {{'void' as parameter must not have type qualifiers}} + 42; +}); +} // namespace GH109538 diff --git a/clang/test/CodeGen/RISCV/riscv-cf-protection.c b/clang/test/CodeGen/RISCV/riscv-cf-protection.c new file mode 100644 index 0000000000000..3a9855a3d2f01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-cf-protection.c @@ -0,0 +1,94 @@ +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicfilp1p0 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=unlabeled -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,UNLABELED-FLAG %s + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicfilp1p0 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=func-sig -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicfilp1p0 -mcf-branch-label-scheme=unlabeled -S \ +// RUN: -emit-llvm %s -o - 2>&1 | 
FileCheck \ +// RUN: --check-prefixes=NO-FLAG,UNLABELED-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicfilp1p0 -mcf-branch-label-scheme=func-sig -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,FUNC-SIG-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv32 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=unlabeled -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,UNLABELED-FLAG %s + +// RUN: %clang --target=riscv32 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=func-sig -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// RUN: %clang --target=riscv32 -mcf-branch-label-scheme=unlabeled -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,UNLABELED-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv32 -mcf-branch-label-scheme=func-sig -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,FUNC-SIG-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicfilp1p0 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=unlabeled -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,UNLABELED-FLAG %s + +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicfilp1p0 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=func-sig -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicfilp1p0 -mcf-branch-label-scheme=unlabeled -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,UNLABELED-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicfilp1p0 -mcf-branch-label-scheme=func-sig -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,FUNC-SIG-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv64 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=unlabeled -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,UNLABELED-FLAG %s + +// RUN: %clang --target=riscv64 -fcf-protection=branch \ +// RUN: -mcf-branch-label-scheme=func-sig -S -emit-llvm %s -o - | FileCheck \ +// RUN: --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// RUN: %clang --target=riscv64 -mcf-branch-label-scheme=unlabeled -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,UNLABELED-SCHEME-UNUSED %s + +// RUN: %clang --target=riscv64 -mcf-branch-label-scheme=func-sig -S \ +// RUN: -emit-llvm %s -o - 2>&1 | FileCheck \ +// RUN: --check-prefixes=NO-FLAG,FUNC-SIG-SCHEME-UNUSED %s + +// Default -mcf-branch-label-scheme is func-sig +// RUN: %clang --target=riscv32 -fcf-protection=branch -S -emit-llvm %s -o - \ +// RUN: | FileCheck --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// Default -mcf-branch-label-scheme is func-sig +// RUN: %clang --target=riscv64 -fcf-protection=branch -S -emit-llvm %s -o - \ +// RUN: | FileCheck --check-prefixes=BRANCH-PROT-FLAG,FUNC-SIG-FLAG %s + +// UNLABELED-SCHEME-UNUSED: warning: argument unused during compilation: +// UNLABELED-SCHEME-UNUSED-SAME: '-mcf-branch-label-scheme=unlabeled' +// FUNC-SIG-SCHEME-UNUSED: warning: argument unused during compilation: +// FUNC-SIG-SCHEME-UNUSED-SAME: 
'-mcf-branch-label-scheme=func-sig' + +// BRANCH-PROT-FLAG-DAG: [[P_FLAG:![0-9]+]] = !{i32 8, !"cf-protection-branch", i32 1} +// UNLABELED-FLAG-DAG: [[S_FLAG:![0-9]+]] = !{i32 1, !"cf-branch-label-scheme", !"unlabeled"} +// FUNC-SIG-FLAG-DAG: [[S_FLAG:![0-9]+]] = !{i32 1, !"cf-branch-label-scheme", !"func-sig"} +// BRANCH-PROT-FLAG-DAG: !llvm.module.flags = !{{[{].*}}[[P_FLAG]]{{.*, }}[[S_FLAG]]{{(,.+)?[}]}} +// NO-FLAG-NOT: !{i32 8, !"cf-protection-branch", i32 1} +// NO-FLAG-NOT: !{i32 8, !"cf-branch-label-scheme", !"unlabeled"} +// NO-FLAG-NOT: !{i32 8, !"cf-branch-label-scheme", !"func-sig"} + +int main() { return 0; } diff --git a/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c b/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c new file mode 100644 index 0000000000000..b4690a5f1c1ca --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c @@ -0,0 +1,434 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +xcvalu -emit-llvm %s -o - \ +// RUN: | FileCheck %s + +#include +#include + +// CHECK-LABEL: @test_alu_slet( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[SLE_I:%.*]] = zext i1 [[TMP4]] to i32 +// CHECK-NEXT: ret i32 [[SLE_I]] +// +int test_alu_slet(int32_t a, int32_t b) { + return __riscv_cv_alu_slet(a, b); +} + +// CHECK-LABEL: @test_alu_sletu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = icmp ule i32 [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[SLEU_I:%.*]] = zext i1 [[TMP4]] to i32 +// CHECK-NEXT: ret i32 [[SLEU_I]] +// +int test_alu_sletu(uint32_t a, uint32_t b) { + return __riscv_cv_alu_sletu(a, b); +} + +// CHECK-LABEL: @test_alu_min( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: 
[[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MIN_I]] +// +int test_alu_min(int32_t a, int32_t b) { + return __riscv_cv_alu_min(a, b); +} + +// CHECK-LABEL: @test_alu_minu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MIN_I]] +// +int test_alu_minu(uint32_t a, uint32_t b) { + return __riscv_cv_alu_minu(a, b); +} + +// CHECK-LABEL: @test_alu_max( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MAX_I]] +// +int test_alu_max(int32_t a, int32_t b) { + return __riscv_cv_alu_max(a, b); +} + +// CHECK-LABEL: @test_alu_maxu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MAX_I]] +// +int test_alu_maxu(uint32_t a, uint32_t b) { + return 
__riscv_cv_alu_maxu(a, b); +} + +// CHECK-LABEL: @test_alu_exths( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTHS_I:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHS_I]] +// +int test_alu_exths(int16_t a) { + return __riscv_cv_alu_exths(a); +} + +// CHECK-LABEL: @test_alu_exthz( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTHZ_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHZ_I]] +// +int test_alu_exthz(uint16_t a) { + return __riscv_cv_alu_exthz(a); +} + +// CHECK-LABEL: @test_alu_extbs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTBS_I:%.*]] = sext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBS_I]] +// +int test_alu_extbs(int8_t a) { + return __riscv_cv_alu_extbs(a); +} + +// CHECK-LABEL: @test_alu_extbz( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTBZ_I:%.*]] = zext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBZ_I]] +// +int test_alu_extbz(uint8_t a) { + return __riscv_cv_alu_extbz(a); +} + +// CHECK-LABEL: @test_alu_clip( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 0, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.riscv.cv.alu.clip(i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test_alu_clip(int32_t a) { + return __riscv_cv_alu_clip(a, 0); +} + +// CHECK-LABEL: @test_alu_clipu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] 
= alloca i32, align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 0, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.riscv.cv.alu.clipu(i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+int test_alu_clipu(uint32_t a) {
+  return __riscv_cv_alu_clipu(a, 0);
+}
+
+// CHECK-LABEL: @test_alu_addN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.addN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_addN(int32_t a, int32_t b) {
+  return __riscv_cv_alu_addN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_adduN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.adduN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_adduN(uint32_t a, uint32_t b) {
+  return __riscv_cv_alu_adduN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_addRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.addRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_addRN(int32_t a, int32_t b) {
+  return __riscv_cv_alu_addRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_adduRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.adduRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_adduRN(uint32_t a, uint32_t b) {
+  return __riscv_cv_alu_adduRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_subN(int32_t a, int32_t b) {
+  return __riscv_cv_alu_subN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subuN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subuN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_subuN(uint32_t a, uint32_t b) {
+  return __riscv_cv_alu_subuN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_subRN(int32_t a, int32_t b) {
+  return __riscv_cv_alu_subRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subuRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subuRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]])
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
+int test_alu_subuRN(uint32_t a, uint32_t b) {
+  return __riscv_cv_alu_subuRN(a, b, 0);
+}
diff --git a/clang/test/CodeGen/RISCV/riscv-xcvalu.c b/clang/test/CodeGen/RISCV/riscv-xcvalu.c
new file mode 100644
index 0000000000000..e4c2a2c3ca28b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv-xcvalu.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple riscv32 -target-feature +xcvalu -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
+
+#include <stdint.h>
+
+// CHECK-LABEL: @test_abs(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[TMP0]], i1 true)
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int test_abs(int a) {
+  return __builtin_abs(a);
+}
+
+// CHECK-LABEL: @test_alu_slet(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = icmp sle i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT: [[SLE:%.*]] = zext i1 [[TMP2]] to i32
+// CHECK-NEXT: ret i32 [[SLE]]
+//
+int test_alu_slet(int32_t a, int32_t b) {
+  return __builtin_riscv_cv_alu_slet(a, b);
+}
+
+// CHECK-LABEL: @test_alu_sletu(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = icmp ule i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT: [[SLEU:%.*]] = zext i1 [[TMP2]] to i32
+// CHECK-NEXT: ret i32 [[SLEU]]
+//
+int test_alu_sletu(uint32_t a, uint32_t b) {
+  return __builtin_riscv_cv_alu_sletu(a, b);
+}
+
+// CHECK-LABEL: @test_alu_exths(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i16
+// CHECK-NEXT: [[EXTHS:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK-NEXT: ret i32 [[EXTHS]]
+//
+int test_alu_exths(int16_t a) {
+  return __builtin_riscv_cv_alu_exths(a);
+}
+
+// CHECK-LABEL: @test_alu_exthz(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i16
+// CHECK-NEXT: [[EXTHZ:%.*]] = zext i16 [[TMP1]] to i32
+// CHECK-NEXT: ret i32 [[EXTHZ]]
+//
+int test_alu_exthz(uint16_t a) {
+  return __builtin_riscv_cv_alu_exthz(a);
+}
+
+// CHECK-LABEL: @test_alu_extbs(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i8
+// CHECK-NEXT: [[EXTBS:%.*]] = sext i8 [[TMP1]] to i32
+// CHECK-NEXT: ret i32 [[EXTBS]]
+//
+int test_alu_extbs(int8_t a) {
+  return __builtin_riscv_cv_alu_extbs(a);
+}
+
+// CHECK-LABEL: @test_alu_extbz(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i8
+// CHECK-NEXT: [[EXTBZ:%.*]] = zext i8 [[TMP1]] to i32
+// CHECK-NEXT: ret i32 [[EXTBZ]]
+//
+int test_alu_extbz(uint8_t a) {
+  return __builtin_riscv_cv_alu_extbz(a);
+}
+
+// CHECK-LABEL: @test_alu_clip(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.cv.alu.clip(i32 [[TMP0]], i32 15)
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int test_alu_clip(int32_t a) {
+  return __builtin_riscv_cv_alu_clip(a, 15);
+}
+
+// CHECK-LABEL: @test_alu_clipu(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.cv.alu.clipu(i32 [[TMP0]], i32 15)
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int test_alu_clipu(uint32_t a) {
+  return __builtin_riscv_cv_alu_clipu(a, 15);
+}
+
+// CHECK-LABEL: @test_alu_addN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.addN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_addN(int32_t a, int32_t b) {
+  return __builtin_riscv_cv_alu_addN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_adduN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.adduN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_adduN(uint32_t a, uint32_t b) {
+  return __builtin_riscv_cv_alu_adduN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_addRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.addRN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_addRN(int32_t a, int32_t b) {
+  return __builtin_riscv_cv_alu_addRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_adduRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.adduRN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_adduRN(uint32_t a, uint32_t b) {
+  return __builtin_riscv_cv_alu_adduRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_subN(int32_t a, int32_t b) {
+  return __builtin_riscv_cv_alu_subN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subuN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subuN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_subuN(uint32_t a, uint32_t b) {
+  return __builtin_riscv_cv_alu_subuN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subRN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_subRN(int32_t a, int32_t b) {
+  return __builtin_riscv_cv_alu_subRN(a, b, 0);
+}
+
+// CHECK-LABEL: @test_alu_subuRN(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subuRN(i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int test_alu_subuRN(uint32_t a, uint32_t b) {
+  return __builtin_riscv_cv_alu_subuRN(a, b, 0);
+}
diff --git a/clang/test/CodeGen/X86/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c
index 81405429b9b6c..ded40ca59781e 100644
--- a/clang/test/CodeGen/X86/bmi-builtins.c
+++ b/clang/test/CodeGen/X86/bmi-builtins.c
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
-// RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
 
 #include <immintrin.h>
 
@@ -232,3 +234,88 @@ unsigned long long test_blsr_u64(unsigned long long __X) {
 #endif
 #endif // !defined(TEST_TZCNT)
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+char andnu32[__andn_u32(0x01234567, 0xFECDBA98) == (~0x01234567 & 0xFECDBA98) ? 1 : -1];
+char andn2u32[_andn_u32(0x01234567, 0xFECDBA98) == (~0x01234567 & 0xFECDBA98) ? 1 : -1];
+
+char bextr32_0[__bextr_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1];
+char bextr32_1[__bextr_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1];
+char bextr32_2[__bextr_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1];
+
+char bextr32_3[_bextr2_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1];
+char bextr32_4[_bextr2_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1];
+char bextr32_5[_bextr2_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1];
+
+char bextr32_6[_bextr_u32(0x00000000, 0x00000000, 0x00000000) == 0x00000000 ? 1 : -1];
+char bextr32_7[_bextr_u32(0x000003F0, 0xFFFFFF04, 0xFFFFFF10) == 0x0000003F ? 1 : -1];
+char bextr32_8[_bextr_u32(0x000003F0, 0xFFFFFF08, 0xFFFFFF30) == 0x00000003 ? 1 : -1];
+
+char blsiu32[__blsi_u32(0x89ABCDEF) == (0x89ABCDEF & -0x89ABCDEF) ? 1 : -1];
+char blsi2u32[_blsi_u32(0x89ABCDEF) == (0x89ABCDEF & -0x89ABCDEF) ? 1 : -1];
+
+char blsmasku32[__blsmsk_u32(0x89ABCDEF) == (0x89ABCDEF ^ (0x89ABCDEF - 1)) ? 1 : -1];
+char blsmask2u32[_blsmsk_u32(0x89ABCDEF) == (0x89ABCDEF ^ (0x89ABCDEF - 1)) ? 1 : -1];
+
+char blsru32[__blsr_u32(0x89ABCDEF) == (0x89ABCDEF & (0x89ABCDEF - 1)) ? 1 : -1];
+char blsr2u32[_blsr_u32(0x89ABCDEF) == (0x89ABCDEF & (0x89ABCDEF - 1)) ? 1 : -1];
+
+char tzcntu16_0[__tzcnt_u16(0x0000) == 16 ? 1 : -1];
+char tzcntu16_1[__tzcnt_u16(0x0001) == 0 ? 1 : -1];
+char tzcntu16_2[__tzcnt_u16(0x0010) == 4 ? 1 : -1];
+
+char tzcnt2u16_0[_tzcnt_u16(0x0000) == 16 ? 1 : -1];
+char tzcnt2u16_1[_tzcnt_u16(0x0001) == 0 ? 1 : -1];
+char tzcnt2u16_2[_tzcnt_u16(0x0010) == 4 ? 1 : -1];
+
+char tzcntu32_0[__tzcnt_u32(0x00000000) == 32 ? 1 : -1];
+char tzcntu32_1[__tzcnt_u32(0x00000001) == 0 ? 1 : -1];
+char tzcntu32_2[__tzcnt_u32(0x00000080) == 7 ? 1 : -1];
+
+char tzcnt2u32_0[_tzcnt_u32(0x00000000) == 32 ? 1 : -1];
+char tzcnt2u32_1[_tzcnt_u32(0x00000001) == 0 ? 1 : -1];
+char tzcnt2u32_2[_tzcnt_u32(0x00000080) == 7 ? 1 : -1];
+
+char tzcnt3u32_0[_mm_tzcnt_32(0x00000000) == 32 ? 1 : -1];
+char tzcnt3u32_1[_mm_tzcnt_32(0x00000001) == 0 ? 1 : -1];
+char tzcnt3u32_2[_mm_tzcnt_32(0x00000080) == 7 ? 1 : -1];
+
+#ifdef __x86_64__
+char andnu64[__andn_u64(0x0123456789ABCDEFULL, 0xFECDBA9876543210ULL) == (~0x0123456789ABCDEFULL & 0xFECDBA9876543210ULL) ? 1 : -1];
+char andn2u64[_andn_u64(0x0123456789ABCDEFULL, 0xFECDBA9876543210ULL) == (~0x0123456789ABCDEFULL & 0xFECDBA9876543210ULL) ? 1 : -1];
+
+char bextr64_0[__bextr_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char bextr64_1[__bextr_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1];
+char bextr64_2[__bextr_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 1 : -1];
+
+char bextr64_3[_bextr2_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char bextr64_4[_bextr2_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1];
+char bextr64_5[_bextr2_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 1 : -1];
+
+char bextr64_6[_bextr_u64(0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char bextr64_7[_bextr_u64(0xF000000000000001ULL, 0x0000000000000001ULL, 0x0000000000000040ULL) == 0x7800000000000000ULL ? 1 : -1];
+char bextr64_8[_bextr_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF10ULL) == 0x0000000000000000ULL ? 1 : -1];
+
+char blsiu64[__blsi_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL & -0x0123456789ABCDEFULL) ? 1 : -1];
+char blsi2u64[_blsi_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL & -0x0123456789ABCDEFULL) ? 1 : -1];
+
+char blsmasku64[__blsmsk_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL ^ (0x0123456789ABCDEFULL - 1)) ? 1 : -1];
+char blsmask2u64[_blsmsk_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL ^ (0x0123456789ABCDEFULL - 1)) ? 1 : -1];
+
+char blsru64[__blsr_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL & (0x0123456789ABCDEFULL - 1)) ? 1 : -1];
+char blsr2u64[_blsr_u64(0x0123456789ABCDEFULL) == (0x0123456789ABCDEFULL & (0x0123456789ABCDEFULL - 1)) ? 1 : -1];
+
+char tzcntu64_0[__tzcnt_u64(0x0000000000000000ULL) == 64 ? 1 : -1];
+char tzcntu64_1[__tzcnt_u64(0x0000000000000001ULL) == 0 ? 1 : -1];
+char tzcntu64_2[__tzcnt_u64(0x0000000800000000ULL) == 35 ? 1 : -1];
+
+char tzcnt2u64_0[_tzcnt_u64(0x0000000000000000ULL) == 64 ? 1 : -1];
+char tzcnt2u64_1[_tzcnt_u64(0x0000000000000001ULL) == 0 ? 1 : -1];
+char tzcnt2u64_2[_tzcnt_u64(0x0000000800000000ULL) == 35 ? 1 : -1];
+
+char tzcnt3u64_0[_mm_tzcnt_64(0x0000000000000000ULL) == 64 ? 1 : -1];
+char tzcnt3u64_1[_mm_tzcnt_64(0x0000000000000001ULL) == 0 ? 1 : -1];
+char tzcnt3u64_2[_mm_tzcnt_64(0x0000000800000000ULL) == 35 ? 1 : -1];
+#endif
+#endif
\ No newline at end of file
diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c
index a9e9bc5862a34..48424f553768b 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
 
 #include <immintrin.h>
 
@@ -22,7 +24,6 @@ unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
 
 #ifdef __i386__
 unsigned int test_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
-  // B32: @test_mulx_u32
   // B32: mul i64
   return _mulx_u32(__X, __Y, __P);
 }
@@ -46,8 +47,68 @@ unsigned long long test_pext_u64(unsigned long long __X, unsigned long long __Y)
 unsigned long long test_mulx_u64(unsigned long long __X, unsigned long long __Y, unsigned long long *__P) {
-  // CHECK: @test_mulx_u64
   // CHECK: mul i128
   return _mulx_u64(__X, __Y, __P);
 }
 #endif
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+char bzhi32_0[_bzhi_u32(0x89ABCDEF, 0) == 0x00000000 ? 1 : -1];
+char bzhi32_1[_bzhi_u32(0x89ABCDEF, 16) == 0x0000CDEF ? 1 : -1];
+char bzhi32_2[_bzhi_u32(0x89ABCDEF, 31) == 0x09ABCDEF ? 1 : -1];
+char bzhi32_3[_bzhi_u32(0x89ABCDEF, 32) == 0x89ABCDEF ? 1 : -1];
+char bzhi32_4[_bzhi_u32(0x89ABCDEF, 99) == 0x89ABCDEF ? 1 : -1];
+char bzhi32_5[_bzhi_u32(0x89ABCDEF, 260) == 0x0000000F ? 1 : -1];
+
+char pdep32_0[_pdep_u32(0x89ABCDEF, 0x00000000) == 0x00000000 ? 1 : -1];
+char pdep32_1[_pdep_u32(0x89ABCDEF, 0x000000F0) == 0x000000F0 ? 1 : -1];
+char pdep32_2[_pdep_u32(0x89ABCDEF, 0xF00000F0) == 0xE00000F0 ? 1 : -1];
+char pdep32_3[_pdep_u32(0x89ABCDEF, 0xFFFFFFFF) == 0x89ABCDEF ? 1 : -1];
+
+char pext32_0[_pext_u32(0x89ABCDEF, 0x00000000) == 0x00000000 ? 1 : -1];
+char pext32_1[_pext_u32(0x89ABCDEF, 0x000000F0) == 0x0000000E ? 1 : -1];
+char pext32_2[_pext_u32(0x89ABCDEF, 0xF00000F0) == 0x0000008E ? 1 : -1];
+char pext32_3[_pext_u32(0x89ABCDEF, 0xFFFFFFFF) == 0x89ABCDEF ? 1 : -1];
+
+constexpr unsigned long long
+test_mulx_u32(unsigned int X, unsigned int Y)
+{
+  unsigned int H{};
+  return _mulx_u32(X, Y, &H) | ((unsigned long long) H << 32);
+}
+
+void mulxu32() {
+  constexpr unsigned X = 0x89ABCDEF, Y = 0x01234567;
+  static_assert(test_mulx_u32(X,Y) == ((unsigned long long)X * Y));
+}
+
+#ifdef __x86_64__
+char bzhi64_0[_bzhi_u64(0x0123456789ABCDEFULL, 0) == 0x0000000000000000ULL ? 1 : -1];
+char bzhi64_1[_bzhi_u64(0x0123456789ABCDEFULL, 32) == 0x0000000089ABCDEFULL ? 1 : -1];
+char bzhi64_2[_bzhi_u64(0x0123456789ABCDEFULL, 99) == 0x0123456789ABCDEFULL ? 1 : -1];
+char bzhi64_3[_bzhi_u64(0x0123456789ABCDEFULL, 520) == 0x00000000000000EFULL ? 1 : -1];
+
+char pdep64_0[_pdep_u64(0x0123456789ABCDEFULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char pdep64_1[_pdep_u64(0x0123456789ABCDEFULL, 0x00000000000000F0ULL) == 0x00000000000000F0ULL ? 1 : -1];
+char pdep64_2[_pdep_u64(0x0123456789ABCDEFULL, 0xF00000F0F00000F0ULL) == 0xC00000D0E00000F0ULL ? 1 : -1];
+char pdep64_3[_pdep_u64(0x0123456789ABCDEFULL, 0xFFFFFFFFFFFFFFFFULL) == 0x0123456789ABCDEFULL ? 1 : -1];
+
+char pext64_0[_pext_u64(0x0123456789ABCDEFULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char pext64_1[_pext_u64(0x0123456789ABCDEFULL, 0x00000000000000F0ULL) == 0x000000000000000EULL ? 1 : -1];
+char pext64_2[_pext_u64(0x0123456789ABCDEFULL, 0xF00000F0F00000F0ULL) == 0x000000000000068EULL ? 1 : -1];
+char pext64_3[_pext_u64(0x0123456789ABCDEFULL, 0xFFFFFFFFFFFFFFFFULL) == 0x0123456789ABCDEFULL ? 1 : -1];
+
+constexpr unsigned __int128
+test_mulx_u64(unsigned long long X, unsigned long long Y)
+{
+  unsigned long long H{};
+  return _mulx_u64(X, Y, &H) | ((unsigned __int128) H << 64);
+}
+
+void mulxu64() {
+  constexpr unsigned long long X = 0x0123456789ABCDEFULL, Y = 0xFEDCBA9876543210ULL;
+  static_assert(test_mulx_u64(X,Y) == ((unsigned __int128)X * Y));
+}
+#endif
+#endif
\ No newline at end of file
diff --git a/clang/test/CodeGen/X86/lzcnt-builtins.c b/clang/test/CodeGen/X86/lzcnt-builtins.c
index 9255207ffaef4..18ced89fc79b1 100644
--- a/clang/test/CodeGen/X86/lzcnt-builtins.c
+++ b/clang/test/CodeGen/X86/lzcnt-builtins.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s
 
 #include <immintrin.h>
 
@@ -32,3 +33,27 @@ unsigned long long test__lzcnt_u64(unsigned long long __X)
   // CHECK: @llvm.ctlz.i64(i64 %{{.*}}, i1 false)
   return _lzcnt_u64(__X);
 }
+
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+char lzcnt16_0[__lzcnt16(0x0000) == 16 ? 1 : -1];
+char lzcnt16_1[__lzcnt16(0x8000) == 0 ? 1 : -1];
+char lzcnt16_2[__lzcnt16(0x0010) == 11 ? 1 : -1];
+
+char lzcnt32_0[__lzcnt32(0x00000000) == 32 ? 1 : -1];
+char lzcnt32_1[__lzcnt32(0x80000000) == 0 ? 1 : -1];
+char lzcnt32_2[__lzcnt32(0x00000010) == 27 ? 1 : -1];
+
+char lzcnt64_0[__lzcnt64(0x0000000000000000ULL) == 64 ? 1 : -1];
+char lzcnt64_1[__lzcnt64(0x8000000000000000ULL) == 0 ? 1 : -1];
+char lzcnt64_2[__lzcnt64(0x0000000100000000ULL) == 31 ? 1 : -1];
+
+char lzcntu32_0[_lzcnt_u32(0x00000000) == 32 ? 1 : -1];
+char lzcntu32_1[_lzcnt_u32(0x80000000) == 0 ? 1 : -1];
+char lzcntu32_2[_lzcnt_u32(0x00000010) == 27 ? 1 : -1];
+
+char lzcntu64_0[_lzcnt_u64(0x0000000000000000ULL) == 64 ? 1 : -1];
+char lzcntu64_1[_lzcnt_u64(0x8000000000000000ULL) == 0 ? 1 : -1];
+char lzcntu64_2[_lzcnt_u64(0x0000000100000000ULL) == 31 ? 1 : -1];
+#endif
\ No newline at end of file
diff --git a/clang/test/CodeGen/X86/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c
index ad4247be9a442..d916627a23f57 100644
--- a/clang/test/CodeGen/X86/tbm-builtins.c
+++ b/clang/test/CodeGen/X86/tbm-builtins.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
 
 #include <x86intrin.h>
 
@@ -177,3 +178,34 @@ unsigned long long test__tzmsk_u64(unsigned long long a) {
   return __tzmsk_u64(a);
 }
 #endif
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+char bextri32_0[__bextri_u32(0x00000000, 0x00000000) == 0x00000000 ? 1 : -1];
+char bextri32_1[__bextri_u32(0x000003F0, 0xFFFF1004) == 0x0000003F ? 1 : -1];
+char bextri32_2[__bextri_u32(0x000003F0, 0xFFFF3008) == 0x00000003 ? 1 : -1];
+
+char blcfill32[__blcfill_u32(0x89ABCDEF) == (0x89ABCDEF & (0x89ABCDEF + 1)) ? 1 : -1];
+char blci32[__blci_u32(0x89ABCDEF) == (0x89ABCDEF | ~(0x89ABCDEF + 1)) ? 1 : -1];
+char blcmsk32[__blcmsk_u32(0x89ABCDEF) == (0x89ABCDEF ^ (0x89ABCDEF + 1)) ? 1 : -1];
+char blcs32[__blcs_u32(0x89ABCDEF) == (0x89ABCDEF | (0x89ABCDEF + 1)) ? 1 : -1];
+char blsfill32[__blsfill_u32(0x89ABCDEF) == (0x89ABCDEF | (0x89ABCDEF - 1)) ? 1 : -1];
+char blsic32[__blsic_u32(0x89ABCDEF) == (~0x89ABCDEF | (0x89ABCDEF - 1)) ? 1 : -1];
+char t1mskc32[__t1mskc_u32(0x89ABCDEF) == (~0x89ABCDEF | (0x89ABCDEF + 1)) ? 1 : -1];
+char tzmsk32[__tzmsk_u32(0x89ABCDEF) == (~0x89ABCDEF & (0x89ABCDEF - 1)) ? 1 : -1];
+
+#ifdef __x86_64__
+char bextri64_0[__bextri_u64(0x0000000000000000ULL, 0x0000000000000000ULL) == 0x0000000000000000ULL ? 1 : -1];
+char bextri64_1[__bextri_u64(0xF000000000000001ULL, 0x0000000000004001ULL) == 0x7800000000000000ULL ? 1 : -1];
+char bextri64_2[__bextri_u64(0xF000000000000001ULL, 0xFFFFFFFFFFFF1001ULL) == 0x0000000000000000ULL ? 1 : -1];
+
+char blcfill64[__blcfill_u64(0xFEDCBA9876543210) == (0xFEDCBA9876543210 & (0xFEDCBA9876543210 + 1)) ? 1 : -1];
+char blci64[__blci_u64(0xFEDCBA9876543210) == (0xFEDCBA9876543210 | ~(0xFEDCBA9876543210 + 1)) ? 1 : -1];
+char blcmsk64[__blcmsk_u64(0xFEDCBA9876543210) == (0xFEDCBA9876543210 ^ (0xFEDCBA9876543210 + 1)) ? 1 : -1];
+char blcs64[__blcs_u64(0xFEDCBA9876543210) == (0xFEDCBA9876543210 | (0xFEDCBA9876543210 + 1)) ? 1 : -1];
+char blsfill64[__blsfill_u64(0xFEDCBA9876543210) == (0xFEDCBA9876543210 | (0xFEDCBA9876543210 - 1)) ? 1 : -1];
+char blsic64[__blsic_u64(0xFEDCBA9876543210) == (~0xFEDCBA9876543210 | (0xFEDCBA9876543210 - 1)) ? 1 : -1];
+char t1mskc64[__t1mskc_u64(0xFEDCBA9876543210) == (~0xFEDCBA9876543210 | (0xFEDCBA9876543210 + 1)) ? 1 : -1];
+char tzmsk64[__tzmsk_u64(0xFEDCBA9876543210) == (~0xFEDCBA9876543210 & (0xFEDCBA9876543210 - 1)) ? 1 : -1];
+#endif
+#endif
diff --git a/clang/test/CodeGen/aarch64-fmv-dependencies.c b/clang/test/CodeGen/aarch64-fmv-dependencies.c
index f2eec11baed37..681f7e82634fa 100644
--- a/clang/test/CodeGen/aarch64-fmv-dependencies.c
+++ b/clang/test/CodeGen/aarch64-fmv-dependencies.c
@@ -111,9 +111,6 @@ __attribute__((target_version("rpres"))) int fmv(void) { return 0; }
 // CHECK: define dso_local i32 @fmv._Msb() #[[sb:[0-9]+]] {
 __attribute__((target_version("sb"))) int fmv(void) { return 0; }
 
-// CHECK: define dso_local i32 @fmv._Msha1() #[[ATTR0:[0-9]+]] {
-__attribute__((target_version("sha1"))) int fmv(void) { return 0; }
-
 // CHECK: define dso_local i32 @fmv._Msha2() #[[sha2:[0-9]+]] {
 __attribute__((target_version("sha2"))) int fmv(void) { return 0; }
diff --git a/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c b/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c
new file mode 100644
index 0000000000000..6bcf9bc946b20
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fp8-intrinsics/acle_sme2_fp8_scale.c
@@ -0,0 +1,416 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+
+// Single x2
+// CHECK-LABEL: @test_svscale_single_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f16_x213svfloat16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]]
+//
+svfloat16x2_t test_svscale_single_f16_x2(svfloat16x2_t op1, svint16_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f16_x2)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_single_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f32_x213svfloat32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv4f32(<vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]]
+//
+svfloat32x2_t test_svscale_single_f32_x2(svfloat32x2_t op1, svint32_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f32_x2)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_single_f64_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f64_x213svfloat64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.single.x2.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]]
+//
+svfloat64x2_t test_svscale_single_f64_x2(svfloat64x2_t op1, svint64_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f64_x2)(op1, op2);
+}
+
+// Single x4
+// CHECK-LABEL: @test_svscale_single_f16_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], <vscale x 8 x half> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16(<vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f16_x413svfloat16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], <vscale x 8 x half> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv8f16(<vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP8]]
+//
+svfloat16x4_t test_svscale_single_f16_x4(svfloat16x4_t op1, svint16_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f16_x4)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_single_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], <vscale x 4 x float> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], <vscale x 4 x float> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32(<vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f32_x413svfloat32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], <vscale x 4 x float> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], <vscale x 4 x float> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv4f32(<vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]]
+//
+svfloat32x4_t test_svscale_single_f32_x4(svfloat32x4_t op1, svint32_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f32_x4)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_single_f64_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], <vscale x 2 x double> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], <vscale x 2 x double> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64(<vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svscale_single_f64_x413svfloat64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], <vscale x 2 x double> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], <vscale x 2 x double> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.single.x4.nxv2f64(<vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]]
+//
+svfloat64x4_t test_svscale_single_f64_x4(svfloat64x4_t op1, svint64_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_single_f64_x4)(op1, op2);
+}
+
+// Multi x2
+// CHECK-LABEL: @test_svscale_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], <vscale x 8 x i16> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16(<vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f16_x213svfloat16x2_t11svint16x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], <vscale x 8 x i16> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.x2.nxv8f16(<vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP8]]
+//
+svfloat16x2_t test_svscale_f16_x2(svfloat16x2_t op1, svint16x2_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f16_x2)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], <vscale x 4 x i32> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32(<vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f32_x213svfloat32x2_t11svint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], <vscale x 4 x i32> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.x2.nxv4f32(<vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]]
+//
+svfloat32x2_t test_svscale_f32_x2(svfloat32x2_t op1, svint32x2_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f32_x2)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_f64_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], <vscale x 2 x i64> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64(<vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f64_x213svfloat64x2_t11svint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], <vscale x 2 x i64> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.x2.nxv2f64(<vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]]
+//
+svfloat64x2_t test_svscale_f64_x2(svfloat64x2_t op1, svint64x2_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f64_x2)(op1, op2);
+}
+
+// Multi x4
+// CHECK-LABEL: @test_svscale_f16_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], <vscale x 8 x half> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]], <vscale x 8 x i16> [[OP2_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP6]], <vscale x 8 x i16> [[OP2_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 0
+// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 1
+// CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 2
+// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 3
+// CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16(<vscale x 8 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], <vscale x 8 x half> [[TMP11]], <vscale x 8 x i16> [[TMP12]], <vscale x 8 x i16> [[TMP13]], <vscale x 8 x i16> [[TMP14]], <vscale x 8 x i16> [[TMP15]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP16]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f16_x413svfloat16x4_t11svint16x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], <vscale x 8 x half> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], <vscale x 8 x i16> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]], <vscale x 8 x i16> [[OP2_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP6]], <vscale x 8 x i16> [[OP2_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 0
+// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 1
+// CPP-CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 2
+// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP7]], 3
+// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.scale.x4.nxv8f16(<vscale x 8 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], <vscale x 8 x half> [[TMP11]], <vscale x 8 x i16> [[TMP12]], <vscale x 8 x i16> [[TMP13]], <vscale x 8 x i16> [[TMP14]], <vscale x 8 x i16> [[TMP15]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP16]]
+//
+svfloat16x4_t test_svscale_f16_x4(svfloat16x4_t op1, svint16x4_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f16_x4)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], <vscale x 4 x float> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], <vscale x 4 x float> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]], <vscale x 4 x i32> [[OP2_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP6]], <vscale x 4 x i32> [[OP2_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 0
+// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 1
+// CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 2
+// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 3
+// CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32(<vscale x 4 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[TMP15]])
+// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP16]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f32_x413svfloat32x4_t11svint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], <vscale x 4 x float> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], <vscale x 4 x float> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], <vscale x 4 x float> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], <vscale x 4 x i32> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]], <vscale x 4 x i32> [[OP2_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP6]], <vscale x 4 x i32> [[OP2_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 0
+// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 1
+// CPP-CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 2
+// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP7]], 3
+// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.fp8.scale.x4.nxv4f32(<vscale x 4 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[TMP15]])
+// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP16]]
+//
+svfloat32x4_t test_svscale_f32_x4(svfloat32x4_t op1, svint32x4_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f32_x4)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_f64_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], <vscale x 2 x double> [[OP1_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], <vscale x 2 x double> [[OP1_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP2_COERCE0:%.*]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[OP2_COERCE1:%.*]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]], <vscale x 2 x i64> [[OP2_COERCE2:%.*]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP6]], <vscale x 2 x i64> [[OP2_COERCE3:%.*]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 0
+// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 1
+// CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 2
+// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 3
+// CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64(<vscale x 2 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]])
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP16]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_f64_x413svfloat64x4_t11svint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> [[OP1_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], <vscale x 2 x double> [[OP1_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], <vscale x 2 x double> [[OP1_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], <vscale x 2 x double> [[OP1_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP2_COERCE0:%.*]], 0
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], <vscale x 2 x i64> [[OP2_COERCE1:%.*]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]], <vscale x 2 x i64> [[OP2_COERCE2:%.*]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP6]], <vscale x 2 x i64> [[OP2_COERCE3:%.*]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 0
+// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 1
+// CPP-CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 2
+// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP7]], 3
+// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.fp8.scale.x4.nxv2f64(<vscale x 2 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]])
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP16]]
+//
+svfloat64x4_t test_svscale_f64_x4(svfloat64x4_t op1, svint64x4_t op2) __arm_streaming
+{
+  return SVE_ACLE_FUNC(svscale,_f64_x4)(op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
new file mode 100644
index 0000000000000..87fec3a491a2d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// REQUIRES: aarch64-registered-target
+#include <arm_neon.h>
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
+// CHECK-NEXT: ret <4 x half> [[FSCALE2_I]]
+//
+float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
+  return vscale_f16(vn, vm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
+// CHECK-NEXT: ret <8 x half> [[FSCALE2_I]]
+//
+float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
+  return vscaleq_f16(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
+// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
+// CHECK-NEXT: ret <2 x float> [[FSCALE2_I]]
+//
+float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
+  return vscale_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
+// CHECK-NEXT: ret <4 x float> [[FSCALE2_I]]
+//
+float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
+  return vscaleq_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
+// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
+// CHECK-NEXT: ret <2 x double> [[FSCALE2_I]]
+//
+float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
+  return vscaleq_f64(vn, vm);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c index
9c639984305d1..1297185c4b50e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c @@ -24,27 +24,13 @@ // CHECK-LABEL: @test_svclamp_single_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x210svint8x2_tu10__SVInt8_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s8_x2, , )(op1, op2, op3); @@ -52,27 +38,13 @@ svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) // CHECK-LABEL: @test_svclamp_single_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x211svint16x2_tu11__SVInt16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s16_x2, , )(op1, op2, op3); @@ -80,27 +52,13 @@ svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t // CHECK-LABEL: @test_svclamp_single_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x211svint32x2_tu11__SVInt32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s32_x2, , )(op1, op2, op3); @@ -108,27 +66,13 @@ svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t // CHECK-LABEL: @test_svclamp_single_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x211svint64x2_tu11__SVInt64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s64_x2, , )(op1, op2, op3); @@ -139,35 +83,13 @@ svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t // CHECK-LABEL: @test_svclamp_single_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x410svint8x4_tu10__SVInt8_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t 
test_svclamp_single_s8_x4(svint8x4_t op1, svint8_t op2, svint8_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s8_x4, , )(op1, op2, op3); @@ -175,35 +97,13 @@ svint8x4_t test_svclamp_single_s8_x4(svint8x4_t op1, svint8_t op2, svint8_t op3) // CHECK-LABEL: @test_svclamp_single_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x411svint16x4_tu11__SVInt16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svclamp_single_s16_x4(svint16x4_t op1, svint16_t op2, svint16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s16_x4, , )(op1, op2, op3); @@ -211,35 +111,13 @@ svint16x4_t test_svclamp_single_s16_x4(svint16x4_t op1, svint16_t op2, svint16_t // CHECK-LABEL: @test_svclamp_single_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x411svint32x4_tu11__SVInt32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svclamp_single_s32_x4(svint32x4_t op1, svint32_t op2, svint32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s32_x4, , )(op1, op2, op3); @@ -247,35 +125,13 @@ svint32x4_t test_svclamp_single_s32_x4(svint32x4_t op1, svint32_t op2, svint32_t // CHECK-LABEL: @test_svclamp_single_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , 
, , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x411svint64x4_tu11__SVInt64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svclamp_single_s64_x4(svint64x4_t op1, svint64_t op2, svint64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_s64_x4, , )(op1, op2, op3); @@ -288,27 +144,13 @@ svint64x4_t test_svclamp_single_s64_x4(svint64x4_t op1, svint64_t op2, svint64_t // CHECK-LABEL: @test_svclamp_single_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x211svuint8x2_tu11__SVUint8_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svclamp_single_u8_x2(svuint8x2_t op1, svuint8_t op2, svuint8_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u8_x2, , )(op1, op2, op3); @@ -316,27 +158,13 @@ 
svuint8x2_t test_svclamp_single_u8_x2(svuint8x2_t op1, svuint8_t op2, svuint8_t // CHECK-LABEL: @test_svclamp_single_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x212svuint16x2_tu12__SVUint16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svclamp_single_u16_x2(svuint16x2_t op1, svuint16_t op2, svuint16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u16_x2, , )(op1, op2, op3); @@ -344,27 +172,13 @@ svuint16x2_t test_svclamp_single_u16_x2(svuint16x2_t op1, svuint16_t op2, svuint // CHECK-LABEL: @test_svclamp_single_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u32_x212svuint32x2_tu12__SVUint32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], 
i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svclamp_single_u32_x2(svuint32x2_t op1, svuint32_t op2, svuint32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u32_x2, , )(op1, op2, op3); @@ -372,27 +186,13 @@ svuint32x2_t test_svclamp_single_u32_x2(svuint32x2_t op1, svuint32_t op2, svuint // CHECK-LABEL: @test_svclamp_single_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x212svuint64x2_tu12__SVUint64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svclamp_single_u64_x2(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u64_x2, , )(op1, op2, op3); @@ -403,35 +203,13 @@ svuint64x2_t test_svclamp_single_u64_x2(svuint64x2_t op1, svuint64_t op2, svuint // CHECK-LABEL: @test_svclamp_single_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 
-// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x411svuint8x4_tu11__SVUint8_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svclamp_single_u8_x4(svuint8x4_t op1, svuint8_t op2, svuint8_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u8_x4, , )(op1, op2, op3); @@ -439,35 +217,13 @@ svuint8x4_t test_svclamp_single_u8_x4(svuint8x4_t op1, svuint8_t op2, svuint8_t // CHECK-LABEL: @test_svclamp_single_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x412svuint16x4_tu12__SVUint16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svclamp_single_u16_x4(svuint16x4_t op1, svuint16_t op2, svuint16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u16_x4, , )(op1, op2, op3); @@ -475,35 +231,13 @@ svuint16x4_t test_svclamp_single_u16_x4(svuint16x4_t op1, svuint16_t op2, svuint // CHECK-LABEL: @test_svclamp_single_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u32_x412svuint32x4_tu12__SVUint32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// 
CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svclamp_single_u32_x4(svuint32x4_t op1, svuint32_t op2, svuint32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u32_x4, , )(op1, op2, op3); @@ -511,35 +245,13 @@ svuint32x4_t test_svclamp_single_u32_x4(svuint32x4_t op1, svuint32_t op2, svuint // CHECK-LABEL: @test_svclamp_single_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x412svuint64x4_tu12__SVUint64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svclamp_single_u64_x4(svuint64x4_t op1, svuint64_t op2, svuint64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_u64_x4, , )(op1, op2, op3); @@ -552,27 +264,13 @@ svuint64x4_t test_svclamp_single_u64_x4(svuint64x4_t op1, svuint64_t op2, svuint // CHECK-LABEL: @test_svclamp_single_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x213svfloat16x2_tu13__SVFloat16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svclamp_single_f16_x2(svfloat16x2_t op1, svfloat16_t op2, svfloat16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f16_x2, , )(op1, op2, op3); @@ -580,27 +278,13 @@ svfloat16x2_t test_svclamp_single_f16_x2(svfloat16x2_t op1, svfloat16_t op2, svf // CHECK-LABEL: @test_svclamp_single_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x213svfloat32x2_tu13__SVFloat32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svclamp_single_f32_x2(svfloat32x2_t op1, svfloat32_t op2, svfloat32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f32_x2, , )(op1, op2, op3); @@ -609,27 +293,13 @@ svfloat32x2_t 
test_svclamp_single_f32_x2(svfloat32x2_t op1, svfloat32_t op2, svf // CHECK-LABEL: @test_svclamp_single_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x213svfloat64x2_tu13__SVFloat64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svclamp_single_f64_x2(svfloat64x2_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f64_x2, , )(op1, op2, op3); @@ -640,35 +310,13 @@ svfloat64x2_t test_svclamp_single_f64_x2(svfloat64x2_t op1, svfloat64_t op2, svf // CHECK-LABEL: @test_svclamp_single_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x413svfloat16x4_tu13__SVFloat16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svclamp_single_f16_x4(svfloat16x4_t op1, svfloat16_t op2, svfloat16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f16_x4, , )(op1, op2, op3); @@ -676,35 +324,13 @@ svfloat16x4_t test_svclamp_single_f16_x4(svfloat16x4_t op1, svfloat16_t op2, svf // CHECK-LABEL: @test_svclamp_single_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x413svfloat32x4_tu13__SVFloat32_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svfloat32_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f32_x4, , )(op1, op2, op3); @@ -712,35 +338,13 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf // CHECK-LABEL: @test_svclamp_single_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x413svfloat64x4_tu13__SVFloat64_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3); @@ -748,27 +352,13 @@ svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svf // CHECK-LABEL: @test_svclamp_single_bf16_x2( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3); @@ -776,35 +366,13 @@ svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, // CHECK-LABEL: @test_svclamp_single_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], [[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16( [[OP1_COERCE0:%.*]], [[OP1_COERCE1:%.*]], [[OP1_COERCE2:%.*]], 
[[OP1_COERCE3:%.*]], [[OP2:%.*]], [[OP3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming { return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c index 2d61670fd6049..2851ea9ccd22c 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -49,27 +49,13 @@ svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) __arm_streaming { // x2 // CHECK-LABEL: @test_svcvt_f32_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x212svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_f32,_u32_x2,,)(zn); @@ -77,27 +63,13 @@ svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svcvt_f32_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x211svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_f32,_s32_x2,,)(zn); @@ -105,27 +77,13 @@ svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svcvt_u32_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_u32,_f32_x2,,)(zn); @@ -133,27 +91,13 @@ svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: 
@test_svcvt_s32_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_s32,_f32_x2,,)(zn); @@ -162,35 +106,13 @@ svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { // x4 // CHECK-LABEL: @test_svcvt_f32_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x412svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_f32,_u32_x4,,)(zn); @@ -198,35 +120,13 @@ svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svcvt_f32_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x411svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_f32,_s32_x4,,)(zn); @@ -234,35 
+134,13 @@ svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svcvt_u32_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_u32,_f32_x4,,)(zn); @@ -270,35 +148,13 @@ svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svcvt_s32_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svcvt_s32_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvt_s32,_f32_x4,,)(zn); @@ -432,27 +288,13 @@ svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_cvt_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // __attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn) __arm_streaming { return 
SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c index fc5c0376e925e..5189ab4af8327 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_cvtl_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn) __arm_streaming { return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c index a1540bba2a8a9..d4d423f982e84 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_svamax_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamax_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svamax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f16_x2)(zdn, zm); @@ -47,27 +33,13 @@ svfloat16x2_t test_svamax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_stre // CHECK-LABEL: @test_svamax_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamax_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svamax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f32_x2)(zdn, zm); @@ -75,27 +47,13 @@ svfloat32x2_t test_svamax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_stre // CHECK-LABEL: @test_svamax_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z18test_svamax_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svamax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f64_x2)(zdn, zm); @@ -103,27 +61,13 @@ svfloat64x2_t test_svamax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svamin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f16_x2)(zdn, zm); @@ -131,27 +75,13 @@ svfloat16x2_t test_svamin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svamin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f32_x2)(zdn, zm); @@ -159,27 +89,13 @@ svfloat32x2_t test_svamin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svamin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f64_x2)(zdn, zm); @@ -189,35 +105,13 @@ svfloat64x2_t test_svamin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_stre // CHECK-LABEL: @test_svamax_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[ZDN_COERCE0:%.*]], 
[[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamax_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svamax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f16_x4)(zdn, zm); @@ -225,35 +119,13 @@ svfloat16x4_t test_svamax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_stre // CHECK-LABEL: @test_svamax_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamax_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svamax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f32_x4)(zdn, zm); @@ -261,35 +133,13 @@ svfloat32x4_t test_svamax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_stre // CHECK-LABEL: @test_svamax_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamax_f64_x413svfloat64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], 
[[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svamax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamax,_f64_x4)(zdn, zm); @@ -297,35 +147,13 @@ svfloat64x4_t test_svamax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = 
tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svamin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f16_x4)(zdn, zm); @@ -333,35 +161,13 @@ svfloat16x4_t test_svamin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svamin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f32_x4)(zdn, zm); @@ -369,35 +175,13 @@ svfloat32x4_t test_svamin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_stre // CHECK-LABEL: @test_svamin_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call 
{ , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svamin_f64_x413svfloat64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svamin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svamin,_f64_x4)(zdn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c index abdb5a46d5453..8ab450587fc70 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -21,27 +21,13 @@ // CHECK-LABEL: @test_svfrinta_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// 
CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrinta,_f32_x2)(zn); @@ -49,35 +35,13 @@ svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrinta_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrinta,_f32_x4)(zn); @@ -87,27 +51,13 @@ svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintam_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z21test_svfrintam_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintm,_f32_x2)(zn); @@ -115,35 +65,13 @@ svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintm_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrintm_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[ZN_COERCE0:%.*]], 
[[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintm,_f32_x4)(zn); @@ -153,27 +81,13 @@ svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintn_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintn,_f32_x2)(zn); @@ -181,35 +95,13 @@ svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintn_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintn,_f32_x4)(zn); @@ -219,27 +111,13 @@ svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintp_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store 
[[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintp,_f32_x2)(zn); @@ -247,35 +125,13 @@ svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svfrintp_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svfrintp_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svrintp,_f32_x4)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c index 6dd55663d7d34..3b17c6d9edb19 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -10,27 +10,13 @@ // CHECK-LABEL: @test_svluti2_lane_zt_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, 
[[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u8_x2(0, zn, 7); @@ -39,27 +25,13 @@ svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0" // CHECK-LABEL: @test_svluti2_lane_zt_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_s8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s8_x2(0, zn, 7); @@ -67,27 +39,13 @@ svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") // CHECK-LABEL: @test_svluti2_lane_zt_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // 
CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u16_x2(0, zn, 7); @@ -96,27 +54,13 @@ svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti2_lane_zt_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_s16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s16_x2(0, zn, 7); @@ -124,27 +68,13 @@ svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: 
@test_svluti2_lane_zt_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_f16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f16_x2(0, zn, 7); @@ -152,27 +82,13 @@ svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CHECK-LABEL: @test_svluti2_lane_zt_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svluti2_lane_zt_bf16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_bf16_x2(0, 
zn, 7); @@ -180,27 +96,13 @@ svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CHECK-LABEL: @test_svluti2_lane_zt_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u32_x2(0, zn, 7); @@ -208,27 +110,13 @@ svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti2_lane_zt_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_s32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t 
test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s32_x2(0, zn, 7); @@ -236,27 +124,13 @@ svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: @test_svluti2_lane_zt_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32 0, [[ZN:%.*]], i32 7) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_f32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32 0, [[ZN:%.*]], i32 7) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f32_x2(0, zn, 7); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c index 8650ec7f62dd8..38059019737f8 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -10,35 +10,13 @@ // CHECK-LABEL: @test_svluti2_lane_zt_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u8_x4(0, zn, 3); @@ -47,35 +25,13 @@ svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0" // CHECK-LABEL: @test_svluti2_lane_zt_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_s8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s8_x4(0, zn, 3); @@ -83,35 +39,13 @@ svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") // CHECK-LABEL: @test_svluti2_lane_zt_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u16_x4(0, zn, 3); @@ -119,35 +53,13 @@ svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti2_lane_zt_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], 
[[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_s16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s16_x4(0, zn, 3); @@ -155,35 +67,13 @@ svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: @test_svluti2_lane_zt_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8f16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_f16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8f16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail 
call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f16_x4(0, zn, 3); @@ -191,35 +81,13 @@ svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CHECK-LABEL: @test_svluti2_lane_zt_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8bf16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svluti2_lane_zt_bf16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8bf16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return 
svluti2_lane_zt_bf16_x4(0, zn, 3); @@ -227,35 +95,13 @@ svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CHECK-LABEL: @test_svluti2_lane_zt_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u32_x4(0, zn, 3); @@ -263,35 +109,13 @@ svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti2_lane_zt_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 
-// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_s32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s32_x4(0, zn, 3); @@ -299,35 +123,13 @@ svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: @test_svluti2_lane_zt_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4f32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_f32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4f32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// 
CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f32_x4(0, zn, 3); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c index f4f11c9fc5b14..db615b3cd1c24 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -10,27 +10,13 @@ // CHECK-LABEL: @test_svluti4_lane_zt_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti4_lane_zt_u8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u8_x2(0, zn, 3); @@ -39,27 +25,13 @@ svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0" // CHECK-LABEL: @test_svluti4_lane_zt_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: 
[[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svluti4_lane_zt_s8u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s8_x2(0, zn, 3); @@ -67,27 +39,13 @@ svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") // CHECK-LABEL: @test_svluti4_lane_zt_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u16_x2(0, zn, 3); @@ -96,27 +54,13 @@ svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti4_lane_zt_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( 
[[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_s16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s16_x2(0, zn, 3); @@ -124,27 +68,13 @@ svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: @test_svluti4_lane_zt_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_f16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f16_x2(0, zn, 3); @@ -152,27 +82,13 @@ svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CHECK-LABEL: @test_svluti4_lane_zt_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] 
= extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svluti4_lane_zt_bf16u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_bf16_x2(0, zn, 3); @@ -180,27 +96,13 @@ svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CHECK-LABEL: @test_svluti4_lane_zt_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u32_x2(0, zn, 3); @@ -208,27 +110,13 @@ svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: @test_svluti4_lane_zt_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_s32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s32_x2(0, zn, 3); @@ -236,27 +124,13 @@ svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: @test_svluti4_lane_zt_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32 0, [[ZN:%.*]], i32 3) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_f32u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32 0, [[ZN:%.*]], i32 3) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f32_x2(0, zn, 3); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c index 16a7421326235..c4c89358c16f8 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c +++ 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -11,36 +11,14 @@ // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_u16 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svluti4_lane_zt_u16u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u16_x4(0, zn, 1); @@ -49,36 +27,14 @@ svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_f16 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svluti4_lane_zt_f16u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f16_x4(0, zn, 1); @@ -87,36 +43,14 @@ svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_bf16 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z25test_svluti4_lane_zt_bf16u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32 
0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_bf16_x4(0, zn, 1); @@ -125,36 +59,14 @@ svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_s16 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svluti4_lane_zt_s16u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s16_x4(0, zn, 1); @@ -163,36 +75,14 @@ svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_u32 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svluti4_lane_zt_u32u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u32_x4(0, zn, 1); @@ -201,36 +91,14 @@ svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_s32 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svluti4_lane_zt_s32u11__SVUint8_t // CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s32_x4(0, zn, 1); @@ -239,36 +107,14 @@ svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0 // CHECK-LABEL: define dso_local { , , , } @test_svluti4_lane_zt_f32 // CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, [[ZN]], i32 1) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { 
<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @_Z24test_svluti4_lane_zt_f32u11__SVUint8_t
 // CPP-CHECK-SAME: ( <vscale x 16 x i8> [[ZN:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, <vscale x 16 x i8> [[ZN]], i32 1)
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
-// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
-// CPP-CHECK-NEXT: store <vscale x 16 x float> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP9]]
+// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]]
 //
 svfloat32x4_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") {
   return svluti4_lane_zt_f32_x4(0, zn, 1);
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
new file mode 100644
index 0000000000000..a2f87aed3187c
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
@@ -0,0 +1,42 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_u8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z19test_luti4_zt_u8_x411svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
+  return svluti4_zt_u8_x4(0, op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_s8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] =
tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CHECK-NEXT: ret { , , , } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { , , , } @_Z19test_luti4_zt_s8_x411svuint8x2_t( +// CPP-CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] +// +svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_s8_x4(0, op); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c index efc68c0b42334..5d57ffb9bdf8c 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c @@ -18,27 +18,13 @@ // CHECK-LABEL: @test_svmax_single_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x210svint8x2_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svmax_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s8_x2)(zdn, zm); @@ -46,27 +32,13 @@ svint8x2_t test_svmax_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming // CHECK-LABEL: @test_svmax_single_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr 
[[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x211svint16x2_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svmax_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s16_x2)(zdn, zm); @@ -74,27 +46,13 @@ svint16x2_t test_svmax_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x211svint32x2_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svmax_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s32_x2)(zdn, zm); @@ -102,27 +60,13 @@ svint32x2_t test_svmax_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x211svint64x2_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svmax_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s64_x2)(zdn, zm); @@ -130,27 +74,13 @@ svint64x2_t test_svmax_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x211svuint8x2_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svmax_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u8_x2)(zdn, zm); @@ -158,27 +88,13 @@ svuint8x2_t test_svmax_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_streami // CHECK-LABEL: @test_svmax_single_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.umax.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x212svuint16x2_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svmax_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u16_x2)(zdn, zm); @@ -186,27 +102,13 @@ svuint16x2_t test_svmax_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x212svuint32x2_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svmax_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u32_x2)(zdn, 
zm); @@ -214,27 +116,13 @@ svuint32x2_t test_svmax_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x212svuint64x2_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svmax_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u64_x2)(zdn, zm); @@ -242,27 +130,13 @@ svuint64x2_t test_svmax_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmax_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmax_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_bf16_x2)(zdn, zm); @@ -270,27 +144,13 @@ svbfloat16x2_t test_svmax_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __ // CHECK-LABEL: @test_svmax_single_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x213svfloat16x2_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmax_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f16_x2)(zdn, zm); @@ -298,27 +158,13 @@ svfloat16x2_t test_svmax_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_ // CHECK-LABEL: @test_svmax_single_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x213svfloat32x2_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmax_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f32_x2)(zdn, zm); @@ -326,27 +172,13 @@ svfloat32x2_t test_svmax_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_ // CHECK-LABEL: @test_svmax_single_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x213svfloat64x2_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmax_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f64_x2)(zdn, zm); @@ -356,35 +188,13 @@ svfloat64x2_t test_svmax_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_ // CHECK-LABEL: @test_svmax_single_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x410svint8x4_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svmax_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s8_x4)(zdn, zm); @@ -392,35 +202,13 @@ svint8x4_t test_svmax_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming // CHECK-LABEL: @test_svmax_single_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x411svint16x4_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], 
i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svmax_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s16_x4)(zdn, zm); @@ -428,35 +216,13 @@ svint16x4_t test_svmax_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x411svint32x4_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t 
test_svmax_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s32_x4)(zdn, zm); @@ -464,35 +230,13 @@ svint32x4_t test_svmax_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x411svint64x4_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svmax_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_s64_x4)(zdn, zm); @@ -500,35 +244,13 @@ svint64x4_t test_svmax_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_stream // CHECK-LABEL: @test_svmax_single_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = 
tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x411svuint8x4_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svmax_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u8_x4)(zdn, zm); @@ -536,35 +258,13 @@ svuint8x4_t test_svmax_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_streami // CHECK-LABEL: @test_svmax_single_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x412svuint16x4_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svmax_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u16_x4)(zdn, zm); @@ -572,35 +272,13 @@ svuint16x4_t test_svmax_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x412svuint32x4_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svmax_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u32_x4)(zdn, zm); @@ -608,35 +286,13 @@ svuint32x4_t test_svmax_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x412svuint64x4_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svmax_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_u64_x4)(zdn, zm); @@ -644,35 +300,13 @@ svuint64x4_t test_svmax_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_str // CHECK-LABEL: @test_svmax_single_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.fmax.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmax_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmax_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_bf16_x4)(zdn, zm); @@ -680,35 +314,13 @@ svbfloat16x4_t test_svmax_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __ // CHECK-LABEL: @test_svmax_single_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x413svfloat16x4_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmax_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f16_x4)(zdn, zm); @@ -716,35 +328,13 @@ svfloat16x4_t test_svmax_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_ // CHECK-LABEL: @test_svmax_single_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x413svfloat32x4_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svmax_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f32_x4)(zdn, zm); @@ -752,35 +342,13 @@ svfloat32x4_t test_svmax_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_ // CHECK-LABEL: @test_svmax_single_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x413svfloat64x4_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// 
CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svmax_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_single_f64_x4)(zdn, zm); @@ -790,27 +358,13 @@ svfloat64x4_t test_svmax_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_ // CHECK-LABEL: @test_svmax_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmax_s8_x210svint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svmax_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s8_x2)(zdn, zm); @@ -818,27 +372,13 @@ svint8x2_t test_svmax_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s16_x211svint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svmax_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s16_x2)(zdn, zm); @@ -846,27 +386,13 @@ svint16x2_t test_svmax_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s32_x211svint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svmax_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s32_x2)(zdn, zm); @@ -874,27 +400,13 @@ svint32x2_t test_svmax_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s64_x211svint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], 
[[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svmax_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s64_x2)(zdn, zm); @@ -902,27 +414,13 @@ svint64x2_t test_svmax_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmax_u8_x211svuint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svmax_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u8_x2)(zdn, zm); @@ -930,27 +428,13 @@ svuint8x2_t test_svmax_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // 
CPP-CHECK-LABEL: @_Z17test_svmax_u16_x212svuint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svmax_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u16_x2)(zdn, zm); @@ -958,27 +442,13 @@ svuint16x2_t test_svmax_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_u32_x212svuint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svmax_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u32_x2)(zdn, zm); @@ -986,27 +456,13 @@ svuint32x2_t test_svmax_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] 
= tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_u64_x212svuint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svmax_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u64_x2)(zdn, zm); @@ -1014,27 +470,13 @@ svuint64x2_t test_svmax_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svmax_bf16_x214svbfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmax_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_bf16_x2)(zdn, zm); @@ -1042,27 +484,13 @@ svbfloat16x2_t test_svmax_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_s // CHECK-LABEL: @test_svmax_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[ZDN_COERCE0:%.*]], 
[[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f16_x2)(zdn, zm); @@ -1070,27 +498,13 @@ svfloat16x2_t test_svmax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_strea // CHECK-LABEL: @test_svmax_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f32_x2)(zdn, zm); @@ -1098,27 +512,13 @@ 
svfloat32x2_t test_svmax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_strea // CHECK-LABEL: @test_svmax_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f64_x2)(zdn, zm); @@ -1128,35 +528,13 @@ svfloat64x2_t test_svmax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_strea // CHECK-LABEL: @test_svmax_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmax_s8_x410svint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], 
[[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svmax_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s8_x4)(zdn, zm); @@ -1164,35 +542,13 @@ svint8x4_t test_svmax_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s16_x411svint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svmax_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s16_x4)(zdn, zm); @@ -1200,35 +556,13 @@ svint16x4_t test_svmax_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s32_x411svint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svmax_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s32_x4)(zdn, zm); @@ -1236,35 +570,13 @@ svint32x4_t test_svmax_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_s64_x411svint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svmax_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_s64_x4)(zdn, zm); @@ -1272,35 +584,13 @@ svint64x4_t test_svmax_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( 
[[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmax_u8_x411svuint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svmax_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u8_x4)(zdn, zm); @@ -1308,35 +598,13 @@ svuint8x4_t test_svmax_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmax_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_u16_x412svuint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], 
[[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svmax_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u16_x4)(zdn, zm); @@ -1344,35 +612,13 @@ svuint16x4_t test_svmax_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_u32_x412svuint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 
3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svmax_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u32_x4)(zdn, zm); @@ -1380,35 +626,13 @@ svuint32x4_t test_svmax_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_u64_x412svuint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svmax_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_u64_x4)(zdn, zm); @@ -1416,35 +640,13 @@ svuint64x4_t test_svmax_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmax_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svmax_bf16_x414svbfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmax_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_bf16_x4)(zdn, zm); @@ -1452,35 +654,13 @@ svbfloat16x4_t test_svmax_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_s // CHECK-LABEL: @test_svmax_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail 
call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f16_x4)(zdn, zm); @@ -1488,35 +668,13 @@ svfloat16x4_t test_svmax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_strea // CHECK-LABEL: @test_svmax_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( 
[[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svmax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f32_x4)(zdn, zm); @@ -1524,35 +682,13 @@ svfloat32x4_t test_svmax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_strea // CHECK-LABEL: @test_svmax_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmax_f64_x413svfloat64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// 
CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svmax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmax,_f64_x4)(zdn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c index 5d06895497cc7..1d47abe8d487c 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_svmaxnm_single_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svmaxnm_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmaxnm_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_bf16_x2)(zdn, zm); @@ -47,27 +33,13 @@ svbfloat16x2_t test_svmaxnm_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) // CHECK-LABEL: @test_svmaxnm_single_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// 
CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmaxnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f16_x2)(zdn, zm); @@ -75,27 +47,13 @@ svfloat16x2_t test_svmaxnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __ar // CHECK-LABEL: @test_svmaxnm_single_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmaxnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f32_x2)(zdn, zm); @@ -103,27 +61,13 @@ svfloat32x2_t test_svmaxnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __ar // CHECK-LABEL: @test_svmaxnm_single_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], 
i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmaxnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f64_x2)(zdn, zm); @@ -133,35 +77,13 @@ svfloat64x2_t test_svmaxnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __ar // CHECK-LABEL: @test_svmaxnm_single_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svmaxnm_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmaxnm_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_bf16_x4)(zdn, zm); @@ -169,35 +91,13 @@ svbfloat16x4_t test_svmaxnm_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) // CHECK-LABEL: @test_svmaxnm_single_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmaxnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f16_x4)(zdn, zm); @@ -205,35 +105,13 @@ svfloat16x4_t test_svmaxnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __ar // CHECK-LABEL: 
@test_svmaxnm_single_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svmaxnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f32_x4)(zdn, zm); @@ -241,35 +119,13 @@ svfloat32x4_t test_svmaxnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __ar // CHECK-LABEL: @test_svmaxnm_single_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svmaxnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_single_f64_x4)(zdn, zm); @@ -279,27 +135,13 @@ svfloat64x4_t test_svmaxnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __ar // CHECK-LABEL: @test_svmaxnm_multi_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_multi_bf16_x214svbfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr 
[[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmaxnm_multi_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_bf16_x2)(zdn, zm); @@ -307,27 +149,13 @@ svbfloat16x2_t test_svmaxnm_multi_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) // CHECK-LABEL: @test_svmaxnm_multi_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmaxnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f16_x2)(zdn, zm); @@ -335,27 +163,13 @@ svfloat16x2_t test_svmaxnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __a // CHECK-LABEL: @test_svmaxnm_multi_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmaxnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f32_x2)(zdn, zm); @@ -363,27 +177,13 @@ svfloat32x2_t test_svmaxnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __a // CHECK-LABEL: @test_svmaxnm_multi_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmaxnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f64_x2)(zdn, zm); @@ -393,35 +193,13 @@ svfloat64x2_t test_svmaxnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __a // CHECK-LABEL: @test_svmaxnm_multi_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svmaxnm_multi_bf16_x414svbfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmaxnm_multi_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_bf16_x4)(zdn, zm); @@ -429,35 +207,13 @@ svbfloat16x4_t test_svmaxnm_multi_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) // CHECK-LABEL: @test_svmaxnm_multi_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { 
, , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmaxnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f16_x4)(zdn, zm); @@ -465,35 +221,13 @@ svfloat16x4_t test_svmaxnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __a // CHECK-LABEL: @test_svmaxnm_multi_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svmaxnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f32_x4)(zdn, zm); @@ -501,35 +235,13 @@ svfloat32x4_t test_svmaxnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __a // CHECK-LABEL: @test_svmaxnm_multi_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x413svfloat64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svmaxnm_multi_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmaxnm,_f64_x4)(zdn, zm); diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c index 2fa7feeee404e..4e70a39311664 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c @@ -18,27 +18,13 @@ // CHECK-LABEL: @test_svmin_single_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x210svint8x2_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svmin_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s8_x2)(zdn, zm); @@ -46,27 +32,13 @@ svint8x2_t test_svmin_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming // CHECK-LABEL: @test_svmin_single_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x211svint16x2_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 
1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svmin_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s16_x2)(zdn, zm); @@ -74,27 +46,13 @@ svint16x2_t test_svmin_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x211svint32x2_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svmin_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s32_x2)(zdn, zm); @@ -102,27 +60,13 @@ svint32x2_t test_svmin_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x211svint64x2_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], 
[[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svmin_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s64_x2)(zdn, zm); @@ -130,27 +74,13 @@ svint64x2_t test_svmin_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x211svuint8x2_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svmin_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u8_x2)(zdn, zm); @@ -158,27 +88,13 @@ svuint8x2_t test_svmin_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_streami // CHECK-LABEL: @test_svmin_single_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z24test_svmin_single_u16_x212svuint16x2_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svmin_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u16_x2)(zdn, zm); @@ -186,27 +102,13 @@ svuint16x2_t test_svmin_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_str // CHECK-LABEL: @test_svmin_single_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x212svuint32x2_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svmin_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u32_x2)(zdn, zm); @@ -214,27 +116,13 @@ svuint32x2_t test_svmin_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_str // CHECK-LABEL: @test_svmin_single_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x212svuint64x2_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svmin_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u64_x2)(zdn, zm); @@ -242,27 +130,13 @@ svuint64x2_t test_svmin_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_str // CHECK-LABEL: @test_svmin_single_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmin_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmin_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_bf16_x2)(zdn, zm); @@ -270,27 +144,13 @@ svbfloat16x2_t test_svmin_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __ // CHECK-LABEL: @test_svmin_single_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], 
[[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x213svfloat16x2_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmin_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f16_x2)(zdn, zm); @@ -298,27 +158,13 @@ svfloat16x2_t test_svmin_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_ // CHECK-LABEL: @test_svmin_single_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x213svfloat32x2_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmin_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f32_x2)(zdn, zm); @@ -326,27 +172,13 @@ svfloat32x2_t 
test_svmin_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_ // CHECK-LABEL: @test_svmin_single_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x213svfloat64x2_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmin_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f64_x2)(zdn, zm); @@ -356,35 +188,13 @@ svfloat64x2_t test_svmin_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_ // CHECK-LABEL: @test_svmin_single_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x410svint8x4_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svmin_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s8_x4)(zdn, zm); @@ -392,35 +202,13 @@ svint8x4_t test_svmin_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming // CHECK-LABEL: @test_svmin_single_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x411svint16x4_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: 
[[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svmin_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s16_x4)(zdn, zm); @@ -428,35 +216,13 @@ svint16x4_t test_svmin_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x411svint32x4_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svmin_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s32_x4)(zdn, zm); @@ -464,35 +230,13 @@ svint32x4_t test_svmin_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = 
tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x411svint64x4_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svmin_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_s64_x4)(zdn, zm); @@ -500,35 +244,13 @@ svint64x4_t test_svmin_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_stream // CHECK-LABEL: @test_svmin_single_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // 
CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x411svuint8x4_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svmin_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u8_x4)(zdn, zm); @@ -536,35 +258,13 @@ svuint8x4_t test_svmin_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_streami // CHECK-LABEL: @test_svmin_single_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_u16_x412svuint16x4_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svmin_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u16_x4)(zdn, zm); @@ -572,35 +272,13 @@ svuint16x4_t test_svmin_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_str // CHECK-LABEL: @test_svmin_single_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x412svuint32x4_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svmin_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u32_x4)(zdn, zm); @@ -608,35 +286,13 @@ svuint32x4_t test_svmin_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_str // CHECK-LABEL: 
@test_svmin_single_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x412svuint64x4_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svmin_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_u64_x4)(zdn, zm); @@ -644,35 +300,13 @@ svuint64x4_t test_svmin_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_str // CHECK-LABEL: @test_svmin_single_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], 
[[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svmin_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmin_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_bf16_x4)(zdn, zm); @@ -680,35 +314,13 @@ svbfloat16x4_t test_svmin_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __ // CHECK-LABEL: @test_svmin_single_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x413svfloat16x4_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) 
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmin_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f16_x4)(zdn, zm); @@ -716,35 +328,13 @@ svfloat16x4_t test_svmin_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_ // CHECK-LABEL: @test_svmin_single_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x413svfloat32x4_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svmin_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f32_x4)(zdn, zm); @@ -752,35 +342,13 @@ svfloat32x4_t test_svmin_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_ // CHECK-LABEL: @test_svmin_single_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x413svfloat64x4_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svmin_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_single_f64_x4)(zdn, zm); @@ -790,27 +358,13 @@ svfloat64x4_t test_svmin_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_ // CHECK-LABEL: @test_svmin_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmin_s8_x210svint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svmin_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s8_x2)(zdn, zm); @@ -818,27 +372,13 @@ svint8x2_t test_svmin_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s16_x211svint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svmin_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s16_x2)(zdn, zm); @@ -846,27 +386,13 @@ svint16x2_t test_svmin_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // 
CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s32_x211svint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svmin_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s32_x2)(zdn, zm); @@ -874,27 +400,13 @@ svint32x2_t test_svmin_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s64_x211svint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svmin_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming 
{ return SVE_ACLE_FUNC(svmin,_s64_x2)(zdn, zm); @@ -902,27 +414,13 @@ svint64x2_t test_svmin_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmin_u8_x211svuint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svmin_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u8_x2)(zdn, zm); @@ -930,27 +428,13 @@ svuint8x2_t test_svmin_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u16_x212svuint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: 
store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svmin_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u16_x2)(zdn, zm); @@ -958,27 +442,13 @@ svuint16x2_t test_svmin_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u32_x212svuint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svmin_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u32_x2)(zdn, zm); @@ -986,27 +456,13 @@ svuint32x2_t test_svmin_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u64_x212svuint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svmin_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u64_x2)(zdn, zm); @@ -1014,27 +470,13 @@ svuint64x2_t test_svmin_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svmin_bf16_x214svbfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svmin_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_bf16_x2)(zdn, zm); @@ -1042,27 +484,13 @@ svbfloat16x2_t test_svmin_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_s // CHECK-LABEL: @test_svmin_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svmin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_f16_x2)(zdn, zm); @@ -1070,27 +498,13 @@ svfloat16x2_t test_svmin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_strea // CHECK-LABEL: @test_svmin_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svmin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_f32_x2)(zdn, zm); @@ -1098,27 +512,13 @@ svfloat32x2_t test_svmin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_strea // CHECK-LABEL: @test_svmin_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) 
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svmin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_f64_x2)(zdn, zm); @@ -1128,35 +528,13 @@ svfloat64x2_t test_svmin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_strea // CHECK-LABEL: @test_svmin_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmin_s8_x410svint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svmin_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s8_x4)(zdn, zm); @@ -1164,35 +542,13 @@ svint8x4_t test_svmin_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s16_x411svint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svmin_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s16_x4)(zdn, zm); @@ -1200,35 +556,13 @@ svint16x4_t test_svmin_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s32_x411svint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svmin_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s32_x4)(zdn, zm); @@ -1236,35 +570,13 @@ svint32x4_t test_svmin_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_s64_x411svint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svmin_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_s64_x4)(zdn, zm); @@ -1272,35 +584,13 @@ svint64x4_t test_svmin_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svmin_u8_x411svuint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[ZDN_COERCE0:%.*]], 
[[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svmin_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u8_x4)(zdn, zm); @@ -1308,35 +598,13 @@ svuint8x4_t test_svmin_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svmin_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u16_x412svuint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] 
= extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svmin_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u16_x4)(zdn, zm); @@ -1344,35 +612,13 @@ svuint16x4_t test_svmin_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u32_x412svuint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svmin_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u32_x4)(zdn, zm); @@ -1380,35 +626,13 @@ svuint32x4_t test_svmin_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { 
, , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_u64_x412svuint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svmin_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_u64_x4)(zdn, zm); @@ -1416,35 +640,13 @@ svuint64x4_t test_svmin_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) __arm_streamin // CHECK-LABEL: @test_svmin_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail 
call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svmin_bf16_x414svbfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svmin_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_bf16_x4)(zdn, zm); @@ -1452,35 +654,13 @@ svbfloat16x4_t test_svmin_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_s // CHECK-LABEL: @test_svmin_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.fmin.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svmin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svmin,_f16_x4)(zdn, zm); @@ -1488,35 +668,13 @@ svfloat16x4_t test_svmin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_strea // CHECK-LABEL: @test_svmin_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svmin_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
-// CPP-CHECK-NEXT:    store <vscale x 16 x float> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]]
 //
 svfloat32x4_t test_svmin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svmin,_f32_x4)(zdn, zm);
@@ -1524,35 +682,13 @@ svfloat32x4_t test_svmin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_strea
 
 // CHECK-LABEL: @test_svmin_f64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.fmin.x4.nxv2f64(<vscale x 2 x double> [[ZDN_COERCE0:%.*]], <vscale x 2 x double> [[ZDN_COERCE1:%.*]], <vscale x 2 x double> [[ZDN_COERCE2:%.*]], <vscale x 2 x double> [[ZDN_COERCE3:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]], <vscale x 2 x double> [[ZM_COERCE2:%.*]], <vscale x 2 x double> [[ZM_COERCE3:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CHECK-NEXT:    store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svmin_f64_x413svfloat64x4_tS_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.fmin.x4.nxv2f64(<vscale x 2 x double> [[ZDN_COERCE0:%.*]], <vscale x 2 x double> [[ZDN_COERCE1:%.*]], <vscale x 2 x double> [[ZDN_COERCE2:%.*]], <vscale x 2 x double> [[ZDN_COERCE3:%.*]], <vscale x 2 x double> [[ZM_COERCE0:%.*]], <vscale x 2 x double> [[ZM_COERCE1:%.*]], <vscale x 2 x double> [[ZM_COERCE2:%.*]], <vscale x 2 x double> [[ZM_COERCE3:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CPP-CHECK-NEXT:    store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 svfloat64x4_t test_svmin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svmin,_f64_x4)(zdn, zm);
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
index 71b8914b816ca..838cb644e5e39 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
@@ -19,27 +19,13 @@
 
 // CHECK-LABEL: @test_svminnm_single_bf16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fminnm.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x bfloat> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z27test_svminnm_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fminnm.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x bfloat> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 svbfloat16x2_t test_svminnm_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svminnm,_single_bf16_x2)(zdn, zm);
@@ -47,27 +33,13 @@ svbfloat16x2_t test_svminnm_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm)
 
 // CHECK-LABEL: @test_svminnm_single_f16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x half>, <vscale x 8 x half> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16(<vscale x 8 x half> [[ZDN_COERCE0:%.*]], <vscale x 8 x half> [[ZDN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x half>, <vscale x 8 x half> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16(<vscale x 8 x half> [[ZDN_COERCE0:%.*]], <vscale x 8 x half> [[ZDN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 svfloat16x2_t test_svminnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svminnm,_single_f16_x2)(zdn, zm);
@@ -75,27 +47,13 @@ svfloat16x2_t test_svminnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) __ar
 
 // CHECK-LABEL: @test_svminnm_single_f32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float>, <vscale x 4 x float> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32(<vscale x 4 x float> [[ZDN_COERCE0:%.*]], <vscale x 4 x float> [[ZDN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
-// CHECK-NEXT:    store <vscale x 8 x float> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x float>, <vscale x 4 x float> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float>, <vscale x 4 x float> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32(<vscale x 4 x float> [[ZDN_COERCE0:%.*]], <vscale x 4 x float> [[ZDN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT:    store <vscale x 8 x float> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x float>, <vscale x 4 x float> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]]
 //
 svfloat32x2_t test_svminnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svminnm,_single_f32_x2)(zdn, zm);
@@ -103,27 +61,13 @@ svfloat32x2_t test_svminnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) __ar
 
 // CHECK-LABEL: @test_svminnm_single_f64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64(<vscale x 2 x double> [[ZDN_COERCE0:%.*]], <vscale x 2 x double> [[ZDN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CHECK-NEXT:    store <vscale x 4 x double> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64(<vscale x 2 x double> [[ZDN_COERCE0:%.*]], <vscale x 2 x double> [[ZDN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    store <vscale x 4 x double> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 svfloat64x2_t test_svminnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svminnm,_single_f64_x2)(zdn, zm);
@@ -133,35 +77,13 @@ svfloat64x2_t test_svminnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) __ar
 
 // CHECK-LABEL: @test_svminnm_single_bf16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fminnm.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
-// CHECK-NEXT:    store <vscale x 32 x bfloat> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z27test_svminnm_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fminnm.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
-// CPP-CHECK-NEXT:    store <vscale x 32 x bfloat> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 svbfloat16x4_t test_svminnm_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm)
__arm_streaming { return SVE_ACLE_FUNC(svminnm,_single_bf16_x4)(zdn, zm); @@ -169,35 +91,13 @@ svbfloat16x4_t test_svminnm_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) // CHECK-LABEL: @test_svminnm_single_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svminnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_single_f16_x4)(zdn, zm); @@ -205,35 +105,13 @@ svfloat16x4_t test_svminnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) __ar // CHECK-LABEL: @test_svminnm_single_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svminnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_single_f32_x4)(zdn, zm); @@ -241,35 +119,13 @@ svfloat32x4_t test_svminnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) __ar // CHECK-LABEL: @test_svminnm_single_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , 
, }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svminnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_single_f64_x4)(zdn, zm); @@ -279,27 +135,13 @@ svfloat64x4_t test_svminnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) __ar // CHECK-LABEL: @test_svminnm_multi_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svminnm_multi_bf16_x214svbfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svminnm_multi_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_bf16_x2)(zdn, zm); @@ -307,27 +149,13 @@ svbfloat16x2_t test_svminnm_multi_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) // CHECK-LABEL: @test_svminnm_multi_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.fminnm.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x213svfloat16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svminnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_f16_x2)(zdn, zm); @@ -335,27 +163,13 @@ svfloat16x2_t test_svminnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __a // CHECK-LABEL: @test_svminnm_multi_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x213svfloat32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svminnm_multi_f32_x2(svfloat32x2_t zdn, 
svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_f32_x2)(zdn, zm); @@ -363,27 +177,13 @@ svfloat32x2_t test_svminnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __a // CHECK-LABEL: @test_svminnm_multi_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x213svfloat64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svminnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_f64_x2)(zdn, zm); @@ -393,35 +193,13 @@ svfloat64x2_t test_svminnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __a // CHECK-LABEL: @test_svminnm_multi_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svminnm_multi_bf16_x414svbfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = 
alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8bf16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svminnm_multi_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_bf16_x4)(zdn, zm); @@ -429,35 +207,13 @@ svbfloat16x4_t test_svminnm_multi_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) // CHECK-LABEL: @test_svminnm_multi_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x413svfloat16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svminnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_f16_x4)(zdn, zm); @@ -465,35 +221,13 @@ svfloat16x4_t test_svminnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __a // CHECK-LABEL: @test_svminnm_multi_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x413svfloat32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svminnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { return 
SVE_ACLE_FUNC(svminnm,_f32_x4)(zdn, zm); @@ -501,35 +235,13 @@ svfloat32x4_t test_svminnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __a // CHECK-LABEL: @test_svminnm_multi_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x413svfloat64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svminnm_multi_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svminnm,_f64_x4)(zdn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c index da17c6b13d17c..b8cd1e1653ea9 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c @@ -9,27 +9,13 @@ // CHECK-LABEL: @test_svread_ver_za8_u8_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_u8_vg2(0, base);
@@ -37,27 +23,13 @@ svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("
 // CHECK-LABEL: @test_svread_ver_za8_s8_vg2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_s8_vg2(0, base);
@@ -65,27 +37,13 @@ svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("z
 // CHECK-LABEL: @test_svread_hor_za8_u8_vg2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_u8_vg2(0, base);
@@ -93,27 +51,13 @@ svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("
 // CHECK-LABEL: @test_svread_hor_za8_s8_vg2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_s8_vg2(0, base);
@@ -121,35 +65,13 @@ svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("z
 // CHECK-LABEL: @test_svread_hor_za8_u8_vg4(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } 
@llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za8_u8_vg4(0, base); @@ -157,35 +79,13 @@ svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_hor_za8_s8_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z26test_svread_hor_za8_s8_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za8_s8_vg4(0, base); @@ -193,35 +93,13 @@ svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("z // CHECK-LABEL: @test_svread_ver_za8_u8_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store 
[[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za8_u8_vg4(0, base); @@ -229,35 +107,13 @@ svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_ver_za8_s8_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za8_s8_vg4(0, base); @@ -265,27 +121,13 @@ svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("z // CHECK-LABEL: @test_svread_hor_za16_u16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], 
[[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_u16_vg2(1, base); @@ -293,27 +135,13 @@ svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_hor_za16_bf16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_bf16_vg2(1, base); @@ -321,27 +149,13 @@ svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) __arm_streaming __ar // CHECK-LABEL: @test_svread_hor_za16_f16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , 
} [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CHECK-NEXT: store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x half>, <vscale x 8 x half> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT: store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_f16_vg2(1, base);
@@ -349,27 +163,13 @@ svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) __arm_streaming __arm_
 // CHECK-LABEL: @test_svread_hor_za16_s16_vg2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg2j(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_s16_vg2(1, base);
@@ -377,27 +177,13 @@ svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in
 // CHECK-LABEL: @test_svread_ver_za16_u16_vg2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> 
@llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_u16_vg2(1, base); @@ -405,27 +191,13 @@ svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_ver_za16_bf16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_bf16_vg2(1, base); @@ -433,27 +205,13 @@ svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) __arm_streaming __ar // CHECK-LABEL: @test_svread_ver_za16_f16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]]) 
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_f16_vg2(1, base); @@ -461,27 +219,13 @@ svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_ver_za16_s16_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_s16_vg2(1, base); @@ -489,35 +233,13 @@ svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_hor_za16_u16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_u16_vg4(1, base); @@ -525,35 +247,13 @@ svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_hor_za16_bf16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// 
CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_bf16_vg4(1, base); @@ -561,35 +261,13 @@ svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) __arm_streaming __ar // CHECK-LABEL: @test_svread_hor_za16_f16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_f16_vg4(1, base); @@ -597,35 +275,13 @@ svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_hor_za16_s16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za16_s16_vg4(1, base); @@ -633,35 +289,13 @@ svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_ver_za16_u16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_u16_vg4(1, base); @@ -669,35 +303,13 @@ svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_ver_za16_bf16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 
[[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_bf16_vg4(1, base); @@ -705,35 +317,13 @@ svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) __arm_streaming __ar // CHECK-LABEL: @test_svread_ver_za16_f16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] 
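Every hunk in this test follows the same regenerated-check pattern, sketched here once so the repetition above and below is easier to scan. Previously, Clang lowered the SME2 multi-vector read builtins by calling the @llvm.aarch64.sme.read.{hor,ver}.vg{2,4}.* intrinsic, re-packing the returned slices into one wide scalable vector with @llvm.vector.insert, spilling that through a RETVAL alloca, and reloading it as the returned struct; the new codegen returns the intrinsic's aggregate result directly. A minimal LLVM IR sketch of the before/after for one representative builtin (the function names @before and @after are illustrative, SME attributes are omitted, and the scalable-vector types are written out in full):

declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32)
declare <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16>, <vscale x 8 x i16>, i64)

; Old codegen: re-pack the two ZA slices into one wide vector and
; round-trip the result through a stack temporary.
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @before(i32 %base) {
entry:
  %retval = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %base)
  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
  %2 = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> %1, i64 0)
  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 1
  %4 = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> %2, <vscale x 8 x i16> %3, i64 8)
  store <vscale x 16 x i16> %4, ptr %retval, align 16
  %5 = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr %retval, align 16
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %5
}

; New codegen: return the intrinsic's result unchanged.
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @after(i32 %base) {
entry:
  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %base)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %0
}

The vg4 variants are the four-element analogue: the old code performed four extractvalue/vector.insert pairs at element offsets 0, 8, 16, and 24 of a <vscale x 32 x i16> (with correspondingly smaller offsets for the 32- and 64-bit element types) before the same store/load round trip, and likewise now collapses to a single ret of the intrinsic result.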
// svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_f16_vg4(1, base); @@ -741,35 +331,13 @@ svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_ver_za16_s16_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za16_s16_vg4(1, base); @@ -777,27 +345,13 @@ svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_hor_za32_u32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// 
CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za32_u32_vg2(3, base); @@ -805,27 +359,13 @@ svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_hor_za32_f32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za32_f32_vg2(3, base); @@ -833,27 +373,13 @@ svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_hor_za32_s32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: 
[[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za32_s32_vg2(3, base); @@ -861,27 +387,13 @@ svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_ver_za32_u32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_u32_vg2(3, base); @@ -889,27 +401,13 @@ svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_ver_za32_f32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], 
i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_f32_vg2(3, base); @@ -917,27 +415,13 @@ svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_ver_za32_s32_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_s32_vg2(3, base); @@ -945,35 +429,13 @@ svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_hor_za32_u32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za32_u32_vg4(3, base); @@ -981,35 +443,13 @@ svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_hor_za32_f32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za32_f32_vg4(3, base); @@ -1017,35 +457,13 @@ svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_hor_za32_s32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) 
__arm_streaming __arm_in("za") { return svread_hor_za32_s32_vg4(3, base); @@ -1053,35 +471,13 @@ svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_ver_za32_u32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_u32_vg4(3, base); @@ -1089,35 +485,13 @@ svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_ver_za32_f32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_f32_vg4(3, base); @@ -1125,35 +499,13 @@ svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_ver_za32_s32_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], 
i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za32_s32_vg4(3, base); @@ -1161,27 +513,13 @@ svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_hor_za64_u64_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za64_u64_vg2(7, base); @@ -1189,27 +527,13 @@ svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_hor_za64_f64_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za64_f64_vg2(7, base); @@ -1217,27 +541,13 @@ svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_hor_za64_s64_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za64_s64_vg2(7, base); @@ -1245,27 +555,13 @@ svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_ver_za64_u64_vg2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg2j( // 
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
return svread_ver_za64_u64_vg2(7, base);
@@ -1273,55 +569,28 @@ svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) __arm_streaming __arm_i
// CHECK-LABEL: @test_svread_ver_za64_f64_vg2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CHECK-NEXT: store <vscale x 4 x double> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg2j(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: store <vscale x 4 x double> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
//
svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
return svread_ver_za64_f64_vg2(7, base);
}
+//
// CHECK-LABEL: @test_svread_ver_za64_s64_vg2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg2j(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
return svread_ver_za64_s64_vg2(7, base);
@@ -1329,35 +598,13 @@ svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in
// CHECK-LABEL: @test_svread_hor_za64_u64_vg4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
-// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
-// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
-// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
-// CHECK-NEXT: store <vscale x 8 x i64> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP9]]
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg4j(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
-// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
-// CPP-CHECK-NEXT: store <vscale x 8 x i64> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP9]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
return svread_hor_za64_u64_vg4(7, base);
@@ -1365,35 +612,13 @@ svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) __arm_streaming __arm_i
// CHECK-LABEL: @test_svread_hor_za64_f64_vg4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CHECK-NEXT: store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg4j(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CPP-CHECK-NEXT: store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
//
svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
return svread_hor_za64_f64_vg4(7, base);
@@ -1401,35 +626,13 @@ svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) __arm_streaming __arm_
// CHECK-LABEL: @test_svread_hor_za64_s64_vg4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
-// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
-// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
-// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
-// CHECK-NEXT: store <vscale x 8 x i64> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_hor_za64_s64_vg4(7, base); @@ -1437,35 +640,13 @@ svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_ver_za64_u64_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za64_u64_vg4(7, base); @@ -1473,35 +654,13 @@ svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) __arm_streaming __arm_i // CHECK-LABEL: @test_svread_ver_za64_f64_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za64_f64_vg4(7, base); @@ -1509,35 +668,13 @@ svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_ver_za64_s64_vg4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in("za") { return svread_ver_za64_s64_vg4(7, base); @@ -1545,27 +682,13 @@ svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za8_s8_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svread_za8_s8_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], 
[[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svread_za8_s8_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za8_s8_vg1x2(base); @@ -1573,27 +696,13 @@ svint8x2_t test_svread_za8_s8_vg1x2(uint32_t base) __arm_streaming __arm_in("za" // CHECK-LABEL: @test_svread_za8_u8_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svread_za8_u8_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svread_za8_u8_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za8_u8_vg1x2(base); @@ -1601,56 +710,27 @@ svuint8x2_t test_svread_za8_u8_vg1x2(uint32_t base) __arm_streaming __arm_in("za // CHECK-LABEL: @test_svread_za16_s16_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_s16_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svread_za16_s16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_s16_vg1x2(base); } -// // CHECK-LABEL: @test_svread_za16_u16_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_u16_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svread_za16_u16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_u16_vg1x2(base); @@ -1658,27 +738,13 @@ svuint16x2_t test_svread_za16_u16_vg1x2(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za16_bf16_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svread_za16_bf16_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], 
[[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svread_za16_bf16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_bf16_vg1x2(base); @@ -1686,27 +752,13 @@ svbfloat16x2_t test_svread_za16_bf16_vg1x2(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_za16_f16_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_f16_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svread_za16_f16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_f16_vg1x2(base); @@ -1714,27 +766,13 @@ svfloat16x2_t test_svread_za16_f16_vg1x2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za32_s32_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_s32_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svread_za32_s32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_s32_vg1x2(base); @@ -1742,27 +780,13 @@ svint32x2_t test_svread_za32_s32_vg1x2(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_za32_u32_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_u32_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svread_za32_u32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_u32_vg1x2(base); @@ -1770,27 +794,13 @@ svuint32x2_t test_svread_za32_u32_vg1x2(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za32_f32_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_f32_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svread_za32_f32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_f32_vg1x2(base); @@ -1798,27 +808,13 @@ svfloat32x2_t test_svread_za32_f32_vg1x2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za64_u64_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_u64_vg1x2(base); @@ -1826,27 +822,13 @@ svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za64_f64_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } 
[[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_f64_vg1x2(base); @@ -1854,27 +836,13 @@ svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za64_s64_vg1x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x2j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_s64_vg1x2(base); @@ -1882,35 +850,13 @@ svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_za8_s8_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svread_za8_s8_vg1x4j( // CPP-CHECK-NEXT: entry: 
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svread_za8_s8_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za8_s8_vg1x4(base); @@ -1918,35 +864,13 @@ svint8x4_t test_svread_za8_s8_vg1x4(uint32_t base) __arm_streaming __arm_in("za" // CHECK-LABEL: @test_svread_za8_u8_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svread_za8_u8_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr 
[[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svread_za8_u8_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za8_u8_vg1x4(base); @@ -1954,35 +878,13 @@ svuint8x4_t test_svread_za8_u8_vg1x4(uint32_t base) __arm_streaming __arm_in("za // CHECK-LABEL: @test_svread_za16_s16_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_s16_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svread_za16_s16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_s16_vg1x4(base); @@ -1990,35 +892,13 @@ svint16x4_t test_svread_za16_s16_vg1x4(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_za16_u16_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_u16_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svread_za16_u16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_u16_vg1x4(base); @@ -2026,35 +906,13 @@ svuint16x4_t test_svread_za16_u16_vg1x4(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za16_bf16_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svread_za16_bf16_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svread_za16_bf16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_bf16_vg1x4(base); @@ -2062,35 +920,13 @@ svbfloat16x4_t test_svread_za16_bf16_vg1x4(uint32_t base) __arm_streaming __arm_ // CHECK-LABEL: @test_svread_za16_f16_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za16_f16_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svread_za16_f16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za16_f16_vg1x4(base); @@ -2098,35 +934,13 @@ svfloat16x4_t test_svread_za16_f16_vg1x4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za32_s32_vg1x4( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_s32_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svread_za32_s32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_s32_vg1x4(base); @@ -2134,35 +948,13 @@ svint32x4_t test_svread_za32_s32_vg1x4(uint32_t base) __arm_streaming __arm_in(" // CHECK-LABEL: @test_svread_za32_u32_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// 
CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_u32_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svread_za32_u32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_u32_vg1x4(base); @@ -2170,35 +962,13 @@ svuint32x4_t test_svread_za32_u32_vg1x4(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za32_f32_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za32_f32_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svread_za32_f32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za32_f32_vg1x4(base); @@ -2206,35 +976,13 @@ svfloat32x4_t test_svread_za32_f32_vg1x4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za64_u64_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_u64_vg1x4(base); @@ -2242,35 +990,13 @@ svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) __arm_streaming __arm_in( // CHECK-LABEL: @test_svread_za64_f64_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_f64_vg1x4(base); @@ -2278,35 +1004,13 @@ svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) __arm_streaming __arm_in // CHECK-LABEL: @test_svread_za64_s64_vg1x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[BASE:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x4j( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[BASE:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail 
call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") { return svread_za64_s64_vg1x4(base); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c index 26804866a7563..5ff801666df88 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_svqdmulh_single_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svqdmulh_single_s8_x210svint8x2_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svqdmulh_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s8_x2,,,)(zdn, zm); @@ -47,27 +33,13 @@ svint8x2_t test_svqdmulh_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streami // CHECK-LABEL: @test_svqdmulh_single_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// 
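The svread hunks above all make the same change: a multi-vector read from ZA now lowers straight to a structure-returning intrinsic call, with no alloca/insert/load round-trip. As a minimal usage sketch modelled on the tests themselves (hypothetical wrapper name; assumes the arm_sme.h ACLE header and an SME2-enabled target, as the tests do):

#include <arm_sme.h>
#include <stdint.h>

// Read four consecutive single-vector groups of ZA at slice `base` into
// one svint32x4_t tuple; the caller must be in streaming mode with ZA
// readable, exactly as the attributes state.
svint32x4_t read_za_quad(uint32_t base) __arm_streaming __arm_in("za") {
  return svread_za32_s32_vg1x4(base);
}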
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s16_x211svint16x2_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svqdmulh_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s16_x2,,,)(zdn, zm); @@ -75,27 +47,13 @@ svint16x2_t test_svqdmulh_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_str // CHECK-LABEL: @test_svqdmulh_single_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s32_x211svint32x2_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svqdmulh_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s32_x2,,,)(zdn, zm); @@ -103,27 +61,13 @@ svint32x2_t test_svqdmulh_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_str // CHECK-LABEL: 
@test_svqdmulh_single_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s64_x211svint64x2_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svqdmulh_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s64_x2,,,)(zdn, zm); @@ -133,35 +77,13 @@ svint64x2_t test_svqdmulh_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_str // CHECK-LABEL: @test_svqdmulh_single_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z26test_svqdmulh_single_s8_x410svint8x4_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } 
[[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svqdmulh_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s8_x4,,,)(zdn, zm); @@ -169,35 +91,13 @@ svint8x4_t test_svqdmulh_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streami // CHECK-LABEL: @test_svqdmulh_single_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s16_x411svint16x4_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr 
[[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svqdmulh_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s16_x4,,,)(zdn, zm); @@ -205,35 +105,13 @@ svint16x4_t test_svqdmulh_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_str // CHECK-LABEL: @test_svqdmulh_single_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s32_x411svint32x4_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svqdmulh_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s32_x4,,,)(zdn, zm); @@ -241,35 +119,13 @@ svint32x4_t test_svqdmulh_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_str // CHECK-LABEL: @test_svqdmulh_single_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z27test_svqdmulh_single_s64_x411svint64x4_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svqdmulh_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_single_s64_x4,,,)(zdn, zm); @@ -279,27 +135,13 @@ svint64x4_t test_svqdmulh_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_str // CHECK-LABEL: @test_svqdmulh_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svqdmulh_s8_x210svint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , 
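The `_single_` svqdmulh hunks above take a tuple accumulator and one plain vector multiplier. A usage sketch under the same assumptions (hypothetical wrapper; the non-overloaded intrinsic spelling, which is what the tests' SVE_ACLE_FUNC macro expands to):

#include <arm_sme.h>

// Saturating doubling multiply-high: each vector of the pair `zdn` is
// multiplied by the single vector `zm`; streaming mode is required.
svint32x2_t scale_pair(svint32x2_t zdn, svint32_t zm) __arm_streaming {
  return svqdmulh_single_s32_x2(zdn, zm);
}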
} [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svqdmulh_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s8_x2,,,)(zdn, zm); @@ -307,27 +149,13 @@ svint8x2_t test_svqdmulh_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { // CHECK-LABEL: @test_svqdmulh_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svqdmulh_s16_x211svint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svqdmulh_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s16_x2,,,)(zdn, zm); @@ -335,27 +163,13 @@ svint16x2_t test_svqdmulh_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streamin // CHECK-LABEL: @test_svqdmulh_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z20test_svqdmulh_s32_x211svint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svqdmulh_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s32_x2,,,)(zdn, zm); @@ -363,27 +177,13 @@ svint32x2_t test_svqdmulh_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streamin // CHECK-LABEL: @test_svqdmulh_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svqdmulh_s64_x211svint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svqdmulh_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s64_x2,,,)(zdn, zm); @@ -393,35 +193,13 @@ svint64x2_t test_svqdmulh_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streamin // CHECK-LABEL: @test_svqdmulh_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( 
poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svqdmulh_s8_x410svint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svqdmulh_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s8_x4,,,)(zdn, zm); @@ -429,35 +207,13 @@ svint8x4_t test_svqdmulh_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { // CHECK-LABEL: @test_svqdmulh_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , 
} [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svqdmulh_s16_x411svint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svqdmulh_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s16_x4,,,)(zdn, zm); @@ -465,35 +221,13 @@ svint16x4_t test_svqdmulh_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streamin // CHECK-LABEL: @test_svqdmulh_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svqdmulh_s32_x411svint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svqdmulh_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s32_x4,,,)(zdn, zm); @@ -501,35 +235,13 @@ svint32x4_t test_svqdmulh_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streamin // CHECK-LABEL: @test_svqdmulh_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z20test_svqdmulh_s64_x411svint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// 
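The remaining svqdmulh hunks cover the tuple-by-tuple forms, which pair each vector of `zdn` with the corresponding vector of `zm`. A sketch on the same assumptions:

#include <arm_sme.h>

// Element-wise tuple form: result[i] = sqdmulh(zdn[i], zm[i]) for each of
// the four vectors in the tuples; streaming mode only.
svint32x4_t scale_quad(svint32x4_t zdn, svint32x4_t zm) __arm_streaming {
  return svqdmulh_s32_x4(zdn, zm);
}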
CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svqdmulh_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svqdmulh,_s64_x4,,,)(zdn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c index fa66c4ff19014..d3b09f071c58f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_svunpk_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x2u10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svunpk_s16_x2(svint8_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_s16,_s8_x2)(zn); @@ -47,27 +33,13 @@ svint16x2_t test_svunpk_s16_x2(svint8_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x2u11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svunpk_u16_x2(svuint8_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_u16,_u8_x2)(zn); @@ -75,27 +47,13 @@ svuint16x2_t test_svunpk_u16_x2(svuint8_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x2u11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svunpk_s32_x2(svint16_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_s32,_s16_x2)(zn); @@ -103,27 +61,13 @@ svint32x2_t test_svunpk_s32_x2(svint16_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x2u12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// 
CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svunpk_u32_x2(svuint16_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_u32,_u16_x2)(zn); @@ -131,27 +75,13 @@ svuint32x2_t test_svunpk_u32_x2(svuint16_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x2u11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svunpk_s64_x2(svint32_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_s64,_s32_x2)(zn); @@ -159,27 +89,13 @@ svint64x2_t test_svunpk_s64_x2(svint32_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x2u12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// 
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 svuint64x2_t test_svunpk_u64_x2(svuint32_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svunpk_u64,_u32_x2)(zn);
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
index 61718f0984ef3..45bc83eac7339 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
@@ -19,35 +19,13 @@
 // CHECK-LABEL: @test_svunpk_s16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
-// CHECK-NEXT:    store <vscale x 32 x i16> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x410svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i16> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x4_t test_svunpk_s16_x4(svint8x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svunpk_s16,_s8_x4)(zn);
@@ -55,35 +33,13 @@ svint16x4_t test_svunpk_s16_x4(svint8x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svunpk_u16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x4.nxv8i16(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16>
[[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x411svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svunpk_u16_x4(svuint8x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_u16,_u8_x4)(zn); @@ -91,35 +47,13 @@ svuint16x4_t test_svunpk_u16_x4(svuint8x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x411svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( 
[[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svunpk_s32_x4(svint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_s32,_s16_x4)(zn); @@ -127,35 +61,13 @@ svint32x4_t test_svunpk_s32_x4(svint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x412svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // 
svuint32x4_t test_svunpk_u32_x4(svuint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_u32,_u16_x4)(zn); @@ -163,35 +75,13 @@ svuint32x4_t test_svunpk_u32_x4(svuint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x411svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svunpk_s64_x4(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_s64,_s32_x4)(zn); @@ -199,35 +89,13 @@ svint64x4_t test_svunpk_s64_x4(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svunpk_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { 
, , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x412svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svunpk_u64_x4(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svunpk_u64,_u32_x4)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c index c118a7192c6ca..de983bcf79309 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c @@ -25,27 +25,13 @@ // CHECK-LABEL: @test_svadd_vector_single2_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); @@ -53,27 +39,13 @@ svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_stream // CHECK-LABEL: @test_svadd_vector_single2_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); @@ -81,27 +53,13 @@ svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_str // CHECK-LABEL: @test_svadd_vector_single2_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, 
[[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); @@ -109,27 +67,13 @@ svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single2_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); @@ -137,27 +81,13 @@ svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm // CHECK-LABEL: @test_svadd_vector_single2_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); @@ -165,27 +95,13 @@ svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single2_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); @@ -193,27 +109,13 @@ svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm // CHECK-LABEL: @test_svadd_vector_single2_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] 
= load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); @@ -221,27 +123,13 @@ svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single2_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); @@ -252,35 +140,13 @@ svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm // CHECK-LABEL: @test_svadd_vector_single4_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); @@ -288,35 +154,13 @@ svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_stream // CHECK-LABEL: @test_svadd_vector_single4_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } 
[[TMP0]] // // CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); @@ -324,35 +168,13 @@ svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_str // CHECK-LABEL: @test_svadd_vector_single4_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } 
[[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); @@ -360,35 +182,13 @@ svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single4_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); @@ -396,35 +196,13 @@ svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm // 
CHECK-LABEL: @test_svadd_vector_single4_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm); @@ -432,35 +210,13 @@ svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single4_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm); @@ -468,35 +224,13 @@ svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm // CHECK-LABEL: @test_svadd_vector_single4_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], 
[[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm); @@ -504,35 +238,13 @@ svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_st // CHECK-LABEL: @test_svadd_vector_single4_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], 
align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm);
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
index 87160444e3c0d..af5a389c7f736 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
@@ -19,27 +19,13 @@
 // CHECK-LABEL: @test_svrshl_single_s8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x2.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x210svint8x2_tu10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x2.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svrshl_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_s8_x2,,,)(zdn, zm);
@@ -47,27 +33,13 @@ svint8x2_t test_svrshl_single_s8_x2(svint8x2_t zdn, svint8_t zm) __arm_streaming
 // CHECK-LABEL: @test_svrshl_single_s16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.single.x2.nxv8i16(<vscale x 8 x i16> [[ZDN_COERCE0:%.*]], <vscale x 8 x i16> [[ZDN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x211svint16x2_tu11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.single.x2.nxv8i16(<vscale x 8 x i16> [[ZDN_COERCE0:%.*]], <vscale x 8 x i16> [[ZDN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x2_t test_svrshl_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_s16_x2,,,)(zdn, zm);
@@ -75,27 +47,13 @@ svint16x2_t test_svrshl_single_s16_x2(svint16x2_t zdn, svint16_t zm) __arm_strea
 // CHECK-LABEL: @test_svrshl_single_s32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.single.x2.nxv4i32(<vscale x 4 x i32> [[ZDN_COERCE0:%.*]], <vscale x 4 x i32> [[ZDN_COERCE1:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CHECK-NEXT:    store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x211svint32x2_tu11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.single.x2.nxv4i32(<vscale x 4 x i32> [[ZDN_COERCE0:%.*]], <vscale x 4 x i32> [[ZDN_COERCE1:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT:    store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 svint32x2_t test_svrshl_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_s32_x2,,,)(zdn, zm);
@@ -103,27 +61,13 @@ svint32x2_t test_svrshl_single_s32_x2(svint32x2_t zdn, svint32_t zm) __arm_strea
 // CHECK-LABEL: @test_svrshl_single_s64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.single.x2.nxv2i64(<vscale x 2 x i64> [[ZDN_COERCE0:%.*]], <vscale x 2 x i64> [[ZDN_COERCE1:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT:    store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x211svint64x2_tu11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.single.x2.nxv2i64(<vscale x 2 x i64> [[ZDN_COERCE0:%.*]], <vscale x 2 x i64> [[ZDN_COERCE1:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 svint64x2_t test_svrshl_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_s64_x2,,,)(zdn, zm);
@@ -131,27 +75,13 @@ svint64x2_t test_svrshl_single_s64_x2(svint64x2_t zdn, svint64_t zm) __arm_strea
 // CHECK-LABEL: @test_svrshl_single_u8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.single.x2.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x211svuint8x2_tu11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.single.x2.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svuint8x2_t test_svrshl_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_u8_x2,,,)(zdn, zm);
@@ -159,27 +89,13 @@ svuint8x2_t test_svrshl_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) __arm_stream
 // CHECK-LABEL: @test_svrshl_single_u16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.single.x2.nxv8i16(<vscale x 8 x i16> [[ZDN_COERCE0:%.*]], <vscale x 8 x i16> [[ZDN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x212svuint16x2_tu12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.single.x2.nxv8i16(<vscale x 8 x i16> [[ZDN_COERCE0:%.*]], <vscale x 8 x i16> [[ZDN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svuint16x2_t test_svrshl_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_u16_x2,,,)(zdn, zm);
@@ -187,27 +103,13 @@ svuint16x2_t test_svrshl_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) __arm_st
 // CHECK-LABEL: @test_svrshl_single_u32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.single.x2.nxv4i32(<vscale x 4 x i32> [[ZDN_COERCE0:%.*]], <vscale x 4 x i32> [[ZDN_COERCE1:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CHECK-NEXT:    store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x212svuint32x2_tu12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.single.x2.nxv4i32(<vscale x 4 x i32> [[ZDN_COERCE0:%.*]], <vscale x 4 x i32> [[ZDN_COERCE1:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT:    store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 svuint32x2_t test_svrshl_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_u32_x2,,,)(zdn, zm);
@@ -215,27 +117,13 @@ svuint32x2_t test_svrshl_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) __arm_st
 // CHECK-LABEL: @test_svrshl_single_u64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.single.x2.nxv2i64(<vscale x 2 x i64> [[ZDN_COERCE0:%.*]], <vscale x 2 x i64> [[ZDN_COERCE1:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT:    store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x212svuint64x2_tu12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.single.x2.nxv2i64(<vscale x 2 x i64> [[ZDN_COERCE0:%.*]], <vscale x 2 x i64> [[ZDN_COERCE1:%.*]], <vscale x 2 x i64> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 svuint64x2_t test_svrshl_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_streaming {
   return SVE_ACLE_FUNC(svrshl,_single_u64_x2,,,)(zdn, zm);
@@ -245,35 +133,13 @@ svuint64x2_t test_svrshl_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) __arm_st
 // CHECK-LABEL: @test_svrshl_single_s8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x4.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZDN_COERCE2:%.*]], <vscale x 16 x i8> [[ZDN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CHECK-NEXT:    store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x410svint8x4_tu10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x4.nxv16i8(<vscale x 16 x i8> [[ZDN_COERCE0:%.*]], <vscale x 16 x i8> [[ZDN_COERCE1:%.*]], <vscale x 16 x i8> [[ZDN_COERCE2:%.*]], <vscale x 16 x i8> [[ZDN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svrshl_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_s8_x4,,,)(zdn, zm); @@ -281,35 +147,13 @@ svint8x4_t test_svrshl_single_s8_x4(svint8x4_t zdn, svint8_t zm) __arm_streaming // CHECK-LABEL: @test_svrshl_single_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x411svint16x4_tu11__SVInt16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t 
test_svrshl_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_s16_x4,,,)(zdn, zm); @@ -317,35 +161,13 @@ svint16x4_t test_svrshl_single_s16_x4(svint16x4_t zdn, svint16_t zm) __arm_strea // CHECK-LABEL: @test_svrshl_single_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x411svint32x4_tu11__SVInt32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svrshl_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_s32_x4,,,)(zdn, zm); @@ -353,35 +175,13 @@ svint32x4_t test_svrshl_single_s32_x4(svint32x4_t zdn, svint32_t zm) __arm_strea // CHECK-LABEL: @test_svrshl_single_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x411svint64x4_tu11__SVInt64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svrshl_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_s64_x4,,,)(zdn, zm); @@ -389,35 +189,13 @@ svint64x4_t test_svrshl_single_s64_x4(svint64x4_t zdn, svint64_t zm) __arm_strea // CHECK-LABEL: @test_svrshl_single_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x411svuint8x4_tu11__SVUint8_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = 
alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svrshl_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_u8_x4,,,)(zdn, zm); @@ -425,35 +203,13 @@ svuint8x4_t test_svrshl_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) __arm_stream // CHECK-LABEL: @test_svrshl_single_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x412svuint16x4_tu12__SVUint16_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svrshl_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_u16_x4,,,)(zdn, zm); @@ -461,35 +217,13 @@ svuint16x4_t test_svrshl_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) __arm_st // CHECK-LABEL: @test_svrshl_single_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x412svuint32x4_tu12__SVUint32_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svrshl_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_u32_x4,,,)(zdn, zm); @@ -497,35 +231,13 @@ svuint32x4_t test_svrshl_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) __arm_st // CHECK-LABEL: @test_svrshl_single_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x412svuint64x4_tu12__SVUint64_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svrshl_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_single_u64_x4,,,)(zdn, zm); @@ -535,27 +247,13 @@ svuint64x4_t test_svrshl_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) __arm_st // CHECK-LABEL: @test_svrshl_multi_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x210svint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: 
[[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svrshl_multi_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s8_x2,,,)(zdn, zm); @@ -563,27 +261,13 @@ svint8x2_t test_svrshl_multi_s8_x2(svint8x2_t zdn, svint8x2_t zm) __arm_streamin // CHECK-LABEL: @test_svrshl_multi_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x211svint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svrshl_multi_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s16_x2,,,)(zdn, zm); @@ -591,27 +275,13 @@ svint16x2_t test_svrshl_multi_s16_x2(svint16x2_t zdn, svint16x2_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], 
[[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x211svint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svrshl_multi_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s32_x2,,,)(zdn, zm); @@ -619,27 +289,13 @@ svint32x2_t test_svrshl_multi_s32_x2(svint32x2_t zdn, svint32x2_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x211svint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svrshl_multi_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s64_x2,,,)(zdn, zm); @@ -647,27 +303,13 @@ svint64x2_t test_svrshl_multi_s64_x2(svint64x2_t zdn, svint64x2_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], 
[[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x211svuint8x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svrshl_multi_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u8_x2,,,)(zdn, zm); @@ -675,27 +317,13 @@ svuint8x2_t test_svrshl_multi_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) __arm_strea // CHECK-LABEL: @test_svrshl_multi_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x212svuint16x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svrshl_multi_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u16_x2,,,)(zdn, zm); @@ -703,27 +331,13 @@ svuint16x2_t 
test_svrshl_multi_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) __arm_s // CHECK-LABEL: @test_svrshl_multi_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x212svuint32x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svrshl_multi_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u32_x2,,,)(zdn, zm); @@ -731,27 +345,13 @@ svuint32x2_t test_svrshl_multi_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) __arm_s // CHECK-LABEL: @test_svrshl_multi_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x212svuint64x2_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svrshl_multi_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u64_x2,,,)(zdn, zm); @@ -761,35 +361,13 @@ svuint64x2_t test_svrshl_multi_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) __arm_s // CHECK-LABEL: @test_svrshl_multi_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x410svint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svrshl_multi_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s8_x4,,,)(zdn, zm); @@ -797,35 +375,13 @@ svint8x4_t test_svrshl_multi_s8_x4(svint8x4_t zdn, svint8x4_t zm) __arm_streamin // CHECK-LABEL: @test_svrshl_multi_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], 
[[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x411svint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svrshl_multi_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s16_x4,,,)(zdn, zm); @@ -833,35 +389,13 @@ svint16x4_t test_svrshl_multi_s16_x4(svint16x4_t zdn, svint16x4_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: 
[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x411svint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svrshl_multi_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s32_x4,,,)(zdn, zm); @@ -869,35 +403,13 @@ svint32x4_t test_svrshl_multi_s32_x4(svint32x4_t zdn, svint32x4_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x411svint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], 
[[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svrshl_multi_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_s64_x4,,,)(zdn, zm); @@ -905,35 +417,13 @@ svint64x4_t test_svrshl_multi_s64_x4(svint64x4_t zdn, svint64x4_t zm) __arm_stre // CHECK-LABEL: @test_svrshl_multi_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x411svuint8x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svrshl_multi_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u8_x4,,,)(zdn, zm); @@ -941,35 +431,13 @@ svuint8x4_t test_svrshl_multi_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) __arm_strea // CHECK-LABEL: @test_svrshl_multi_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x412svuint16x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svrshl_multi_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u16_x4,,,)(zdn, zm); @@ -977,35 +445,13 @@ svuint16x4_t test_svrshl_multi_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) __arm_s // CHECK-LABEL: @test_svrshl_multi_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x412svuint32x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svrshl_multi_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u32_x4,,,)(zdn, zm); @@ -1013,35 +459,13 @@ svuint32x4_t test_svrshl_multi_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) __arm_s // CHECK-LABEL: @test_svrshl_multi_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x412svuint64x4_tS_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[ZDN_COERCE0:%.*]], [[ZDN_COERCE1:%.*]], [[ZDN_COERCE2:%.*]], [[ZDN_COERCE3:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]], [[ZM_COERCE2:%.*]], [[ZM_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svrshl_multi_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) __arm_streaming { return SVE_ACLE_FUNC(svrshl,_u64_x4,,,)(zdn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c index a95f89faf7783..4047b2fbd1965 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c @@ -19,27 +19,13 @@ // CHECK-LABEL: @test_svsel_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svsel_s8_x2u11__SVCount_t10svint8x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } 
[[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svsel_s8_x2(svcount_t pn, svint8x2_t zn, svint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s8_x2)(pn, zn, zm); @@ -47,27 +33,13 @@ svint8x2_t test_svsel_s8_x2(svcount_t pn, svint8x2_t zn, svint8x2_t zm) __arm_st // CHECK-LABEL: @test_svsel_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svsel_u8_x2u11__SVCount_t11svuint8x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svsel_u8_x2(svcount_t pn, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u8_x2)(pn, zn, zm); @@ -77,27 +49,13 @@ svuint8x2_t test_svsel_u8_x2(svcount_t pn, svuint8x2_t zn, svuint8x2_t zm) __arm // CHECK-LABEL: @test_svsel_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } 
[[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_s16_x2u11__SVCount_t11svint16x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svsel_s16_x2(svcount_t pn, svint16x2_t zn, svint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s16_x2)(pn, zn, zm); @@ -105,27 +63,13 @@ svint16x2_t test_svsel_s16_x2(svcount_t pn, svint16x2_t zn, svint16x2_t zm) __ar // CHECK-LABEL: @test_svsel_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u16_x2u11__SVCount_t12svuint16x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svsel_u16_x2(svcount_t pn, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u16_x2)(pn, zn, zm); @@ -133,27 +77,13 @@ svuint16x2_t test_svsel_u16_x2(svcount_t pn, svuint16x2_t zn, svuint16x2_t zm) _ // CHECK-LABEL: @test_svsel_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// 
CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f16_x2u11__SVCount_t13svfloat16x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svsel_f16_x2(svcount_t pn, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f16_x2)(pn, zn, zm); @@ -161,27 +91,13 @@ svfloat16x2_t test_svsel_f16_x2(svcount_t pn, svfloat16x2_t zn, svfloat16x2_t zm // CHECK-LABEL: @test_svsel_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x2u11__SVCount_t14svbfloat16x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svsel_bf16_x2(svcount_t pn, svbfloat16x2_t 
zn, svbfloat16x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_bf16_x2)(pn, zn, zm); @@ -191,27 +107,13 @@ svbfloat16x2_t test_svsel_bf16_x2(svcount_t pn, svbfloat16x2_t zn, svbfloat16x2_ // CHECK-LABEL: @test_svsel_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_s32_x2u11__SVCount_t11svint32x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svsel_s32_x2(svcount_t pn, svint32x2_t zn, svint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s32_x2)(pn, zn, zm); @@ -219,27 +121,13 @@ svint32x2_t test_svsel_s32_x2(svcount_t pn, svint32x2_t zn, svint32x2_t zm) __ar // CHECK-LABEL: @test_svsel_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u32_x2u11__SVCount_t12svuint32x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( 
poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svsel_u32_x2(svcount_t pn, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u32_x2)(pn, zn, zm); @@ -247,27 +135,13 @@ svuint32x2_t test_svsel_u32_x2(svcount_t pn, svuint32x2_t zn, svuint32x2_t zm) _ // CHECK-LABEL: @test_svsel_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f32_x2u11__SVCount_t13svfloat32x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svsel_f32_x2(svcount_t pn, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f32_x2)(pn, zn, zm); @@ -277,27 +151,13 @@ svfloat32x2_t test_svsel_f32_x2(svcount_t pn, svfloat32x2_t zn, svfloat32x2_t zm // CHECK-LABEL: @test_svsel_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z17test_svsel_s64_x2u11__SVCount_t11svint64x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svsel_s64_x2(svcount_t pn, svint64x2_t zn, svint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s64_x2)(pn, zn, zm); @@ -305,27 +165,13 @@ svint64x2_t test_svsel_s64_x2(svcount_t pn, svint64x2_t zn, svint64x2_t zm) __ar // CHECK-LABEL: @test_svsel_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u64_x2u11__SVCount_t12svuint64x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svsel_u64_x2(svcount_t pn, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u64_x2)(pn, zn, zm); @@ -333,27 +179,13 @@ svuint64x2_t test_svsel_u64_x2(svcount_t pn, svuint64x2_t zn, svuint64x2_t zm) _ // CHECK-LABEL: @test_svsel_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f64_x2u11__SVCount_t13svfloat64x2_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svsel_f64_x2(svcount_t pn, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f64_x2)(pn, zn, zm); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c index 997b6acf96244..871d70943c9df 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c @@ -19,35 +19,13 @@ // CHECK-LABEL: @test_svsel_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svsel_s8_x4u11__SVCount_t10svint8x4_tS0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], 
[[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svsel_s8_x4(svcount_t pn, svint8x4_t zn1, svint8x4_t zn2) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s8_x4)(pn, zn1, zn2); @@ -55,35 +33,13 @@ svint8x4_t test_svsel_s8_x4(svcount_t pn, svint8x4_t zn1, svint8x4_t zn2) __arm_ // CHECK-LABEL: @test_svsel_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svsel_u8_x4u11__SVCount_t11svuint8x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svsel_u8_x4(svcount_t pn, svuint8x4_t zn1, svuint8x4_t zn2, svuint8x4_t zn3, svuint8x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u8_x4)(pn, zn1, zn2); @@ -93,35 +49,13 @@ svuint8x4_t test_svsel_u8_x4(svcount_t pn, svuint8x4_t zn1, svuint8x4_t zn2, svu // CHECK-LABEL: @test_svsel_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_s16_x4u11__SVCount_t11svint16x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svsel_s16_x4(svcount_t pn, svint16x4_t zn1, svint16x4_t zn2, svint16x4_t 
zn3, svint16x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s16_x4)(pn, zn1, zn2); @@ -129,35 +63,13 @@ svint16x4_t test_svsel_s16_x4(svcount_t pn, svint16x4_t zn1, svint16x4_t zn2, sv // CHECK-LABEL: @test_svsel_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u16_x4u11__SVCount_t12svuint16x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svsel_u16_x4(svcount_t pn, svuint16x4_t zn1, svuint16x4_t zn2, svuint16x4_t zn3, svuint16x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u16_x4)(pn, zn1, zn2); @@ -165,35 +77,13 @@ svuint16x4_t test_svsel_u16_x4(svcount_t pn, svuint16x4_t zn1, svuint16x4_t zn2, // CHECK-LABEL: @test_svsel_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], 
[[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f16_x4u11__SVCount_t13svfloat16x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svsel_f16_x4(svcount_t pn, svfloat16x4_t zn1, svfloat16x4_t zn2, svfloat16x4_t zn3, svfloat16x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f16_x4)(pn, zn1, zn2); @@ -201,35 +91,13 @@ svfloat16x4_t test_svsel_f16_x4(svcount_t pn, svfloat16x4_t zn1, svfloat16x4_t z // CHECK-LABEL: @test_svsel_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x4u11__SVCount_t14svbfloat16x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svsel_bf16_x4(svcount_t pn, svbfloat16x4_t zn1, svbfloat16x4_t zn2, svbfloat16x4_t zn3, svbfloat16x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_bf16_x4)(pn, zn1, zn2); @@ -239,35 +107,13 @@ svbfloat16x4_t test_svsel_bf16_x4(svcount_t pn, svbfloat16x4_t zn1, svbfloat16x4 // CHECK-LABEL: @test_svsel_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z17test_svsel_s32_x4u11__SVCount_t11svint32x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svsel_s32_x4(svcount_t pn, svint32x4_t zn1, svint32x4_t zn2, svint32x4_t zn3, svint32x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s32_x4)(pn, zn1, zn2); @@ -275,35 +121,13 @@ svint32x4_t test_svsel_s32_x4(svcount_t pn, svint32x4_t zn1, svint32x4_t zn2, sv // CHECK-LABEL: @test_svsel_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u32_x4u11__SVCount_t12svuint32x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svsel_u32_x4(svcount_t pn, svuint32x4_t zn1, svuint32x4_t zn2, svuint32x4_t zn3, svuint32x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u32_x4)(pn, zn1, zn2); @@ -311,35 +135,13 @@ svuint32x4_t test_svsel_u32_x4(svcount_t pn, svuint32x4_t zn1, svuint32x4_t zn2, // CHECK-LABEL: @test_svsel_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f32_x4u11__SVCount_t13svfloat32x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svsel_f32_x4(svcount_t pn, svfloat32x4_t zn1, svfloat32x4_t zn2, svfloat32x4_t zn3, svfloat32x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f32_x4)(pn, zn1, zn2); @@ -349,35 +151,13 @@ svfloat32x4_t test_svsel_f32_x4(svcount_t pn, svfloat32x4_t zn1, svfloat32x4_t z // CHECK-LABEL: @test_svsel_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_s64_x4u11__SVCount_t11svint64x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svsel_s64_x4(svcount_t pn, svint64x4_t zn1, svint64x4_t zn2, svint64x4_t zn3, svint64x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_s64_x4)(pn, zn1, zn2); @@ -385,35 +165,13 @@ svint64x4_t test_svsel_s64_x4(svcount_t pn, svint64x4_t zn1, 
svint64x4_t zn2, sv // CHECK-LABEL: @test_svsel_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_u64_x4u11__SVCount_t12svuint64x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svsel_u64_x4(svcount_t pn, svuint64x4_t zn1, svuint64x4_t zn2, svuint64x4_t zn3, svuint64x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_u64_x4)(pn, zn1, zn2); @@ -421,35 +179,13 @@ svuint64x4_t test_svsel_u64_x4(svcount_t pn, svuint64x4_t zn1, svuint64x4_t zn2, // CHECK-LABEL: @test_svsel_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svsel_f64_x4u11__SVCount_t13svfloat64x4_tS0_S0_S0_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[ZN1_COERCE0:%.*]], [[ZN1_COERCE1:%.*]], [[ZN1_COERCE2:%.*]], [[ZN1_COERCE3:%.*]], [[ZN2_COERCE0:%.*]], [[ZN2_COERCE1:%.*]], [[ZN2_COERCE2:%.*]], [[ZN2_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svsel_f64_x4(svcount_t pn, svfloat64x4_t zn1, svfloat64x4_t zn2, svfloat64x4_t zn3, svfloat64x4_t zn4) __arm_streaming { return SVE_ACLE_FUNC(svsel,_f64_x4)(pn, zn1, zn2); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c index de605bab67cc3..9a66ee5262082 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c @@ -20,27 +20,13 @@ // CHECK-LABEL: @test_svuzp_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z16test_svuzp_s8_x210svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svuzp_s8_x2(svint8x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svuzp,_s8_x2)(zn);
@@ -48,27 +34,13 @@ svint8x2_t test_svuzp_s8_x2(svint8x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svuzp_u8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x211svuint8x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svuint8x2_t test_svuzp_u8_x2(svuint8x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svuzp,_u8_x2)(zn);
@@ -78,27 +50,13 @@ svuint8x2_t test_svuzp_u8_x2(svuint8x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svuzp_s16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzp.x2.nxv8i16(<vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: 
@_Z17test_svuzp_s16_x211svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svuzp_s16_x2(svint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s16_x2)(zn); @@ -106,27 +64,13 @@ svint16x2_t test_svuzp_s16_x2(svint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x212svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svuzp_u16_x2(svuint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u16_x2)(zn); @@ -134,27 +78,13 @@ svuint16x2_t test_svuzp_u16_x2(svuint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } 
[[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x213svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svuzp_f16_x2(svfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f16_x2)(zn); @@ -162,27 +92,13 @@ svfloat16x2_t test_svuzp_f16_x2(svfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x214svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svuzp_bf16_x2(svbfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_bf16_x2)(zn); @@ -192,27 +108,13 @@ svbfloat16x2_t test_svuzp_bf16_x2(svbfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 
-// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x211svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svuzp_s32_x2(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s32_x2)(zn); @@ -220,27 +122,13 @@ svint32x2_t test_svuzp_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x212svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svuzp_u32_x2(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u32_x2)(zn); @@ -248,27 +136,13 @@ svuint32x2_t test_svuzp_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr 
[[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svuzp_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f32_x2)(zn); @@ -278,27 +152,13 @@ svfloat32x2_t test_svuzp_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_s64_x211svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svuzp_s64_x2(svint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s64_x2)(zn); @@ -306,27 +166,13 @@ svint64x2_t test_svuzp_s64_x2(svint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: 
[[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x212svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svuzp_u64_x2(svuint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u64_x2)(zn); @@ -334,27 +180,13 @@ svuint64x2_t test_svuzp_u64_x2(svuint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x213svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svuzp_f64_x2(svfloat64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f64_x2)(zn); @@ -364,27 +196,13 @@ svfloat64x2_t test_svuzp_f64_x2(svfloat64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr 
[[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x210svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svuzpq_s8_x2(svint8x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s8_x2)(zn); @@ -392,27 +210,13 @@ svint8x2_t test_svuzpq_s8_x2(svint8x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x211svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svuzpq_u8_x2(svuint8x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u8_x2)(zn); @@ -420,27 +224,13 @@ svuint8x2_t test_svuzpq_u8_x2(svuint8x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: 
store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x211svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svuzpq_s16_x2(svint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s16_x2)(zn); @@ -448,27 +238,13 @@ svint16x2_t test_svuzpq_s16_x2(svint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x212svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svuzpq_u16_x2(svuint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u16_x2)(zn); @@ -476,27 +252,13 @@ svuint16x2_t test_svuzpq_u16_x2(svuint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( 
[[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_f16_x213svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svuzpq_f16_x2(svfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_f16_x2)(zn); @@ -504,27 +266,13 @@ svfloat16x2_t test_svuzpq_f16_x2(svfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x214svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svuzpq_bf16_x2(svbfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_bf16_x2)(zn); @@ -532,27 +280,13 @@ svbfloat16x2_t test_svuzpq_bf16_x2(svbfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x211svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svuzpq_s32_x2(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s32_x2)(zn); @@ -560,27 +294,13 @@ svint32x2_t test_svuzpq_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x212svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svuzpq_u32_x2(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u32_x2)(zn); @@ -588,27 +308,13 @@ svuint32x2_t test_svuzpq_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svuzpq_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_f32_x2)(zn); @@ -616,27 +322,13 @@ svfloat32x2_t test_svuzpq_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x211svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svuzpq_s64_x2(svint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s64_x2)(zn); @@ -644,27 +336,13 @@ svint64x2_t test_svuzpq_s64_x2(svint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], 
i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x212svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svuzpq_u64_x2(svuint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u64_x2)(zn); @@ -672,27 +350,13 @@ svuint64x2_t test_svuzpq_u64_x2(svuint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x213svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svuzpq_f64_x2(svfloat64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_f64_x2)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c index aa210f59508b5..131928615edcd 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c @@ -20,35 +20,13 @@ // CHECK-LABEL: @test_svuzp_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x4.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CHECK-NEXT:    store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svuzp_s8_x410svint8x4_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x4.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CPP-CHECK-NEXT:    store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x4_t test_svuzp_s8_x4(svint8x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svuzp,_s8_x4)(zn);
@@ -56,35 +34,13 @@ svint8x4_t test_svuzp_s8_x4(svint8x4_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svuzp_u8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x4.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CHECK-NEXT:    store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]],
align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x411svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svuzp_u8_x4(svuint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u8_x4)(zn); @@ -94,35 +50,13 @@ svuint8x4_t test_svuzp_u8_x4(svuint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_s16_x411svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svuzp_s16_x4(svint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s16_x4)(zn); @@ -130,35 +64,13 @@ svint16x4_t test_svuzp_s16_x4(svint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x412svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svuzp_u16_x4(svuint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u16_x4)(zn); @@ -166,35 +78,13 @@ svuint16x4_t test_svuzp_u16_x4(svuint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN_COERCE0:%.*]], 
[[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x413svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svuzp_f16_x4(svfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f16_x4)(zn); @@ -202,35 +92,13 @@ svfloat16x4_t test_svuzp_f16_x4(svfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , 
, } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x414svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svuzp_bf16_x4(svbfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_bf16_x4)(zn); @@ -240,35 +108,13 @@ svbfloat16x4_t test_svuzp_bf16_x4(svbfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x411svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], 
[[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svuzp_s32_x4(svint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s32_x4)(zn); @@ -276,35 +122,13 @@ svint32x4_t test_svuzp_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x412svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svuzp_u32_x4(svuint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u32_x4)(zn); @@ -312,35 +136,13 @@ svuint32x4_t test_svuzp_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svuzp_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f32_x4)(zn); @@ -350,35 +152,13 @@ svfloat32x4_t test_svuzp_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: 
@_Z17test_svuzp_s64_x411svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svuzp_s64_x4(svint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_s64_x4)(zn); @@ -386,35 +166,13 @@ svint64x4_t test_svuzp_s64_x4(svint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x412svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] 
= tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svuzp_u64_x4(svuint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_u64_x4)(zn); @@ -422,35 +180,13 @@ svuint64x4_t test_svuzp_u64_x4(svuint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzp_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x413svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svuzp_f64_x4(svfloat64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzp,_f64_x4)(zn); @@ -460,35 +196,13 @@ svfloat64x4_t test_svuzp_f64_x4(svfloat64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( 
poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x410svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svuzpq_s8_x4(svint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s8_x4)(zn); @@ -496,35 +210,13 @@ svint8x4_t test_svuzpq_s8_x4(svint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x411svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = 
tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svuzpq_u8_x4(svuint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u8_x4)(zn); @@ -532,35 +224,13 @@ svuint8x4_t test_svuzpq_u8_x4(svuint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x411svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svuzpq_s16_x4(svint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s16_x4)(zn); @@ -568,35 +238,13 @@ svint16x4_t test_svuzpq_s16_x4(svint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x412svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svuzpq_u16_x4(svuint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u16_x4)(zn); @@ -604,35 +252,13 @@ svuint16x4_t test_svuzpq_u16_x4(svuint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_f16_x413svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svuzpq_f16_x4(svfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_f16_x4)(zn); @@ -640,35 +266,13 @@ svfloat16x4_t test_svuzpq_f16_x4(svfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x414svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN_COERCE0:%.*]], 
[[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svuzpq_bf16_x4(svbfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_bf16_x4)(zn); @@ -676,35 +280,13 @@ svbfloat16x4_t test_svuzpq_bf16_x4(svbfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x411svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, 
ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svuzpq_s32_x4(svint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s32_x4)(zn); @@ -712,35 +294,13 @@ svint32x4_t test_svuzpq_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x412svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svuzpq_u32_x4(svuint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_u32_x4)(zn); @@ -748,35 +308,13 @@ svuint32x4_t test_svuzpq_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( 
[[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svuzpq_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_f32_x4)(zn); @@ -784,35 +322,13 @@ svfloat32x4_t test_svuzpq_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x411svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svuzpq_s64_x4(svint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svuzpq,_s64_x4)(zn); @@ -820,35 +336,13 @@ svint64x4_t test_svuzpq_s64_x4(svint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svuzpq_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x412svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } 
[[TMP0]]
 //
 svuint64x4_t test_svuzpq_u64_x4(svuint64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svuzpq,_u64_x4)(zn);
@@ -856,35 +350,13 @@ svuint64x4_t test_svuzpq_u64_x4(svuint64x4_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svuzpq_f64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzpq.x4.nxv2f64(<vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZN_COERCE2:%.*]], <vscale x 2 x double> [[ZN_COERCE3:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CHECK-NEXT:    store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x413svfloat64x4_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzpq.x4.nxv2f64(<vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZN_COERCE2:%.*]], <vscale x 2 x double> [[ZN_COERCE3:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CPP-CHECK-NEXT:    store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 svfloat64x4_t test_svuzpq_f64_x4(svfloat64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svuzpq,_f64_x4)(zn);
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
index a29c347e3197f..787b7d0b3ea1a 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
@@ -19,27 +19,13 @@
 // CHECK-LABEL: @test_svzip_s8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svzip_s8_x210svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svzip_s8_x2(svint8x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_s8_x2)(zn);
@@ -47,27 +33,13 @@ svint8x2_t test_svzip_s8_x2(svint8x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_u8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svzip_u8_x211svuint8x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svuint8x2_t test_svzip_u8_x2(svuint8x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_u8_x2)(zn);
@@ -77,27 +49,13 @@ svuint8x2_t test_svzip_u8_x2(svuint8x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_s16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svzip_s16_x211svint16x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x2_t test_svzip_s16_x2(svint16x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_s16_x2)(zn);
@@ -105,27 +63,13 @@ svint16x2_t test_svzip_s16_x2(svint16x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_u16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svzip_u16_x212svuint16x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svuint16x2_t test_svzip_u16_x2(svuint16x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_u16_x2)(zn);
@@ -133,27 +77,13 @@ svuint16x2_t test_svzip_u16_x2(svuint16x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_f16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x half>, <vscale x 8 x half> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.zip.x2.nxv8f16(<vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svzip_f16_x213svfloat16x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x half>, <vscale x 8 x half> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.zip.x2.nxv8f16(<vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x half> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x half>, <vscale x 8 x half> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 svfloat16x2_t test_svzip_f16_x2(svfloat16x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_f16_x2)(zn);
@@ -161,27 +91,13 @@ svfloat16x2_t test_svzip_f16_x2(svfloat16x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_bf16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.zip.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CHECK-NEXT:    store <vscale x 16 x bfloat> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x214svbfloat16x2_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.zip.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT:    store <vscale x 16 x bfloat> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 svbfloat16x2_t test_svzip_bf16_x2(svbfloat16x2_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svzip,_bf16_x2)(zn);
@@ -191,27 +107,13 @@ svbfloat16x2_t test_svzip_bf16_x2(svbfloat16x2_t zn) __arm_streaming {
 // CHECK-LABEL: @test_svzip_s32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.zip.x2.nxv4i32(<vscale x 4 x i32> [[ZN_COERCE0:%.*]], <vscale x 4 x i32> [[ZN_COERCE1:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call
@llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_s32_x211svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svzip_s32_x2(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s32_x2)(zn); @@ -219,27 +121,13 @@ svint32x2_t test_svzip_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_u32_x212svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svzip_u32_x2(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u32_x2)(zn); @@ -247,27 +135,13 @@ svuint32x2_t test_svzip_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_f32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svzip_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_f32_x2)(zn); @@ -277,27 +151,13 @@ svfloat32x2_t test_svzip_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_s64_x211svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svzip_s64_x2(svint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s64_x2)(zn); @@ -305,27 +165,13 @@ svint64x2_t test_svzip_s64_x2(svint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } 
[[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_u64_x212svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svzip_u64_x2(svuint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u64_x2)(zn); @@ -333,27 +179,13 @@ svuint64x2_t test_svzip_u64_x2(svuint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_f64_x213svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svzip_f64_x2(svfloat64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_f64_x2)(zn); @@ -363,27 +195,13 @@ svfloat64x2_t test_svzip_f64_x2(svfloat64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x210svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svzipq_s8_x2(svint8x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s8_x2)(zn); @@ -391,27 +209,13 @@ svint8x2_t test_svzipq_s8_x2(svint8x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u8_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x211svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svzipq_u8_x2(svuint8x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u8_x2)(zn); @@ -419,27 +223,13 @@ svuint8x2_t test_svzipq_u8_x2(svuint8x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN_COERCE0:%.*]], 
[[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x211svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svzipq_s16_x2(svint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s16_x2)(zn); @@ -447,27 +237,13 @@ svint16x2_t test_svzipq_s16_x2(svint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x212svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svzipq_u16_x2(svuint16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u16_x2)(zn); @@ -475,27 +251,13 @@ svuint16x2_t test_svzipq_u16_x2(svuint16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x213svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svzipq_f16_x2(svfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f16_x2)(zn); @@ -503,27 +265,13 @@ svfloat16x2_t test_svzipq_f16_x2(svfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_bf16_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x214svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svzipq_bf16_x2(svbfloat16x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_bf16_x2)(zn); @@ -531,27 +279,13 @@ svbfloat16x2_t test_svzipq_bf16_x2(svbfloat16x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x211svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svzipq_s32_x2(svint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s32_x2)(zn); @@ -559,27 +293,13 @@ svint32x2_t test_svzipq_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u32_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u32_x212svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svzipq_u32_x2(svuint32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u32_x2)(zn); @@ -587,27 +307,13 @@ svuint32x2_t test_svzipq_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f32_x2( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x213svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svzipq_f32_x2(svfloat32x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f32_x2)(zn); @@ -615,27 +321,13 @@ svfloat32x2_t test_svzipq_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x211svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svzipq_s64_x2(svint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s64_x2)(zn); @@ -643,27 +335,13 @@ svint64x2_t test_svzipq_s64_x2(svint64x2_t zn) __arm_streaming { // 
CHECK-LABEL: @test_svzipq_u64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x212svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svzipq_u64_x2(svuint64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u64_x2)(zn); @@ -671,27 +349,13 @@ svuint64x2_t test_svzipq_u64_x2(svuint64x2_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f64_x2( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x213svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svzipq_f64_x2(svfloat64x2_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f64_x2)(zn);
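Every hunk in the zipx2 test above makes the same mechanical update: the auto-generated assertions no longer expect the intrinsic's two-vector result to be rebuilt on the stack (an alloca of the pair, extractvalue/llvm.vector.insert steps, a store, and a reload) and instead expect the aggregate returned by the llvm.aarch64.sve.zip.x2 / zipq.x2 call to be returned directly. The sketch below illustrates the asserted pattern for one representative intrinsic; the wrapper name zip_pair is hypothetical, the IR in the comments is paraphrased from the CHECK lines above (with the scalable-vector types written out) rather than copied from any RUN output, and it assumes a clang recent enough to ship the SME2 ACLE in <arm_sme.h>, built with whatever target flags this test's RUN lines use (not shown in this excerpt).

#include <arm_sme.h>

// Hypothetical wrapper around one of the ACLE intrinsics exercised above.
svint8x2_t zip_pair(svint8x2_t zn) __arm_streaming {
  // Old expected IR (simplified): round-trip through a stack slot.
  //   %retval = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
  //   %0 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> }
  //        @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1)
  //   ... extractvalue + @llvm.vector.insert, store, reload ...
  //   ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %reloaded
  // New expected IR: the call's aggregate result is returned as-is.
  //   %0 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> }
  //        @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1)
  //   ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %0
  return svzip_s8_x2(zn);
}

The zipx4 test that follows records the same simplification for the four-vector forms, where the alloca and all four extractvalue/insert pairs drop out in favour of returning the call result unchanged.

diff --git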
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c index be40ecb4bcaa3..9bea471bc9837 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c @@ -19,35 +19,13 @@ // CHECK-LABEL: @test_svzip_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svzip_s8_x410svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svzip_s8_x4(svint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s8_x4)(zn); @@ -55,35 +33,13 @@ svint8x4_t test_svzip_s8_x4(svint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z16test_svzip_u8_x411svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svzip_u8_x4(svuint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u8_x4)(zn); @@ -93,35 +49,13 @@ svuint8x4_t test_svzip_u8_x4(svuint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_s16_x411svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], 
[[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svzip_s16_x4(svint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s16_x4)(zn); @@ -129,35 +63,13 @@ svint16x4_t test_svzip_s16_x4(svint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_u16_x412svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] 
+// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svzip_u16_x4(svuint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u16_x4)(zn); @@ -165,35 +77,13 @@ svuint16x4_t test_svzip_u16_x4(svuint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_f16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_f16_x413svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svzip_f16_x4(svfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_f16_x4)(zn); @@ -201,35 +91,13 @@ svfloat16x4_t test_svzip_f16_x4(svfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue 
{ , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x414svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svzip_bf16_x4(svbfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_bf16_x4)(zn); @@ -239,35 +107,13 @@ svbfloat16x4_t test_svzip_bf16_x4(svbfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_s32_x411svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } 
[[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svzip_s32_x4(svint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s32_x4)(zn); @@ -275,35 +121,13 @@ svint32x4_t test_svzip_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_u32_x412svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t 
test_svzip_u32_x4(svuint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u32_x4)(zn); @@ -311,35 +135,13 @@ svuint32x4_t test_svzip_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svzip_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_f32_x4)(zn); @@ -349,35 +151,13 @@ svfloat32x4_t test_svzip_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_s64_x411svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svzip_s64_x4(svint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_s64_x4)(zn); @@ -385,35 +165,13 @@ svint64x4_t test_svzip_s64_x4(svint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_u64_x412svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 
0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svzip_u64_x4(svuint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_u64_x4)(zn); @@ -421,35 +179,13 @@ svuint64x4_t test_svzip_u64_x4(svuint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzip_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzip_f64_x413svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svzip_f64_x4(svfloat64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzip,_f64_x4)(zn); @@ -459,35 +195,13 @@ svfloat64x4_t 
test_svzip_f64_x4(svfloat64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x410svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svzipq_s8_x4(svint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s8_x4)(zn); @@ -495,35 +209,13 @@ svint8x4_t test_svzipq_s8_x4(svint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u8_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: 
[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x411svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svzipq_u8_x4(svuint8x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u8_x4)(zn); @@ -531,35 +223,13 @@ svuint8x4_t test_svzipq_u8_x4(svuint8x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x411svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svzipq_s16_x4(svint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s16_x4)(zn); @@ -567,35 +237,13 @@ svint16x4_t test_svzipq_s16_x4(svint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x412svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svzipq_u16_x4(svuint16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u16_x4)(zn); @@ -603,35 +251,13 @@ svuint16x4_t test_svzipq_u16_x4(svuint16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f16_x4( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x413svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svzipq_f16_x4(svfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f16_x4)(zn); @@ -639,35 +265,13 @@ svfloat16x4_t test_svzipq_f16_x4(svfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_bf16_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], 
[[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x414svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svzipq_bf16_x4(svbfloat16x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_bf16_x4)(zn); @@ -675,35 +279,13 @@ svbfloat16x4_t test_svzipq_bf16_x4(svbfloat16x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x411svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], 
[[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svzipq_s32_x4(svint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s32_x4)(zn); @@ -711,35 +293,13 @@ svint32x4_t test_svzipq_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u32_x412svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svzipq_u32_x4(svuint32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u32_x4)(zn); @@ -747,35 +307,13 @@ svuint32x4_t test_svzipq_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f32_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, 
align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x413svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svzipq_f32_x4(svfloat32x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f32_x4)(zn); @@ -783,35 +321,13 @@ svfloat32x4_t test_svzipq_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_s64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], 
align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x411svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svzipq_s64_x4(svint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_s64_x4)(zn); @@ -819,35 +335,13 @@ svint64x4_t test_svzipq_s64_x4(svint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_u64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x412svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svzipq_u64_x4(svuint64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_u64_x4)(zn); @@ -855,35 +349,13 @@ svuint64x4_t test_svzipq_u64_x4(svuint64x4_t zn) __arm_streaming { // CHECK-LABEL: @test_svzipq_f64_x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x413svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZN_COERCE2:%.*]], [[ZN_COERCE3:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svzipq_f64_x4(svfloat64x4_t zn) __arm_streaming { return SVE_ACLE_FUNC(svzipq,_f64_x4)(zn); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write_lane_zt.c new file mode 100644 index 0000000000000..e73697a569473 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write_lane_zt.c @@ -0,0 +1,402 @@ 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_u8_1(
+// CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z23test_write_lane_zt_u8_1u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_u8_1(svuint8_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _u8)(0, v, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_s8_2(
+// CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]], i32 2)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z23test_write_lane_zt_s8_2u10__SVInt8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]], i32 2)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_s8_2(svint8_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _s8)(0, v, 2);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_u16_3(
+// CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_u16_3u12__SVUint16_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_u16_3(svuint16_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _u16)(0, v, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_s16_1(
+// CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_s16_1u11__SVInt16_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_s16_1(svint16_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _s16)(0, v, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_u32_2(
+// CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]], i32 2)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_u32_2u12__SVUint32_t(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]], i32 2)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_u32_2(svuint32_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _u32)(0, v, 2);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_s32_3(
+// CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]], i32 3)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_s32_3u11__SVInt32_t(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]], i32 3)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_s32_3(svint32_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _s32)(0, v, 3);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_u64_0(
+// CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_u64_0u12__SVUint64_t(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_u64_0(svuint64_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _u64)(0, v, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_s64_1(
+// CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_s64_1u11__SVInt64_t(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_s64_1(svint64_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _s64)(0, v, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_f16_2(
+// CHECK-SAME: <vscale x 8 x half> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8f16(i32 0, <vscale x 8 x half> [[V]], i32 2)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_f16_2u13__SVFloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8f16(i32 0, <vscale x 8 x half> [[V]], i32 2)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_f16_2(svfloat16_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _f16)(0, v, 2);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_bf16_3(
+// CHECK-SAME: <vscale x 8 x bfloat> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8bf16(i32 0, <vscale x 8 x bfloat> [[V]], i32 3)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z25test_write_lane_zt_bf16_3u14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv8bf16(i32 0, <vscale x 8 x bfloat> [[V]], i32 3)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_bf16_3(svbfloat16_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _bf16)(0, v, 3);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_f32_0(
+// CHECK-SAME: <vscale x 4 x float> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4f32(i32 0, <vscale x 4 x float> [[V]], i32 2)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_f32_0u13__SVFloat32_t(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv4f32(i32 0, <vscale x 4 x float> [[V]], i32 2)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_f32_0(svfloat32_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _f32)(0, v, 2);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_lane_zt_f64_1(
+// CHECK-SAME: <vscale x 2 x double> [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2f64(i32 0, <vscale x 2 x double> [[V]], i32 1)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z24test_write_lane_zt_f64_1u13__SVFloat64_t(
+// CPP-CHECK-SAME: <vscale x 2 x double> [[V:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.lane.zt.nxv2f64(i32 0, <vscale x 2 x double> [[V]], i32 1)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_lane_zt_f64_1(svfloat64_t v) __arm_streaming __arm_inout("zt0") {
+  SVE_ACLE_FUNC(svwrite_lane_zt, _f64)(0, v, 1);
+}
+
+//ALIAS
+// CHECK-LABEL: define dso_local void @test_write_zt_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z16test_write_zt_u8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR2:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_u8(svuint8_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _u8)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z16test_write_zt_s8u10__SVInt8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_s8(svint8_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _s8)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_u16u12__SVUint16_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_u16(svuint16_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _u16)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_s16u11__SVInt16_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8i16(i32 0, <vscale x 8 x i16> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_s16(svint16_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _s16)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_u32(
+// CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_u32u12__SVUint32_t(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_u32(svuint32_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _u32)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_s32u11__SVInt32_t(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4i32(i32 0, <vscale x 4 x i32> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_s32(svint32_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _s32)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_u64(
+// CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_u64u12__SVUint64_t(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_u64(svuint64_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _u64)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_s64u11__SVInt64_t(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2i64(i32 0, <vscale x 2 x i64> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_s64(svint64_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _s64)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_f16(
+// CHECK-SAME: <vscale x 8 x half> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8f16(i32 0, <vscale x 8 x half> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_f16u13__SVFloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8f16(i32 0, <vscale x 8 x half> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_f16(svfloat16_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _f16)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_bf16(
+// CHECK-SAME: <vscale x 8 x bfloat> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8bf16(i32 0, <vscale x 8 x bfloat> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z18test_write_zt_bf16u14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv8bf16(i32 0, <vscale x 8 x bfloat> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_bf16(svbfloat16_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _bf16)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write_zt_f32(
+// CHECK-SAME: <vscale x 4 x float> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4f32(i32 0, <vscale x 4 x float> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z17test_write_zt_f32u13__SVFloat32_t(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv4f32(i32 0, <vscale x 4 x float> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write_zt_f32(svfloat32_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _f32)(0, v);
+}
+
+// CHECK-LABEL: define dso_local void @test_write__zt_f64(
+// CHECK-SAME: <vscale x 2 x double> [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2f64(i32 0, <vscale x 2 x double> [[V]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z18test_write__zt_f64u13__SVFloat64_t(
+// CPP-CHECK-SAME: <vscale x 2 x double> [[V:%.*]]) #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.zt.nxv2f64(i32 0, <vscale x 2 x double> [[V]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_write__zt_f64(svfloat64_t v) __arm_streaming __arm_out("zt0") {
+  SVE_ACLE_FUNC(svwrite_zt, _f64)(0, v);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
index 77b02b4c4708f..7fa2249827c4e 100644
--- a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c @@ -11,28 +11,14 @@ // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za8_s8_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z26test_svreadz_hor_za8_s8_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svreadz_hor_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -42,28 +28,14 @@ svint8x2_t test_svreadz_hor_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za8_u8_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z26test_svreadz_hor_za8_u8_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 
16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svreadz_hor_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -73,28 +45,14 @@ svuint8x2_t test_svreadz_hor_za8_u8_x2(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za16_s16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za16_s16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svreadz_hor_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -104,28 +62,14 @@ svint16x2_t test_svreadz_hor_za16_s16_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za16_u16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za16_u16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svreadz_hor_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -135,28 +79,14 @@ svuint16x2_t test_svreadz_hor_za16_u16_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za16_f16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za16_f16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svreadz_hor_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -166,28 +96,14 @@ svfloat16x2_t test_svreadz_hor_za16_f16_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za16_bf16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: 
define dso_local { , } @_Z29test_svreadz_hor_za16_bf16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svreadz_hor_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -198,28 +114,14 @@ svbfloat16x2_t test_svreadz_hor_za16_bf16_x2(uint32_t slice) __arm_streaming __a // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za32_s32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za32_s32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svreadz_hor_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -229,28 +131,14 @@ svint32x2_t test_svreadz_hor_za32_s32_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za32_u32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } 
[[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za32_u32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svreadz_hor_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -260,28 +148,14 @@ svuint32x2_t test_svreadz_hor_za32_u32_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za32_f32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za32_f32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svreadz_hor_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -291,28 +165,14 @@ svfloat32x2_t test_svreadz_hor_za32_f32_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za64_s64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za64_s64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svreadz_hor_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -322,28 +182,14 @@ svint64x2_t test_svreadz_hor_za64_s64_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za64_u64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za64_u64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svreadz_hor_za64_u64_x2(uint32_t slice) __arm_streaming 
__arm_inout("za") { @@ -353,28 +199,14 @@ svuint64x2_t test_svreadz_hor_za64_u64_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_hor_za64_f64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_hor_za64_f64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svreadz_hor_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -389,28 +221,14 @@ svfloat64x2_t test_svreadz_hor_za64_f64_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za8_s8_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z26test_svreadz_ver_za8_s8_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint8x2_t test_svreadz_ver_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -420,28 +238,14 @@ svint8x2_t test_svreadz_ver_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za8_u8_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z26test_svreadz_ver_za8_u8_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svreadz_ver_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -451,28 +255,14 @@ svuint8x2_t test_svreadz_ver_za8_u8_x2(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za16_s16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za16_s16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svreadz_ver_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -482,28 +272,14 @@ svint16x2_t test_svreadz_ver_za16_s16_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za16_u16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za16_u16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svreadz_ver_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -513,28 +289,14 @@ svuint16x2_t test_svreadz_ver_za16_u16_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za16_f16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { 
, } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za16_f16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svreadz_ver_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -544,28 +306,14 @@ svfloat16x2_t test_svreadz_ver_za16_f16_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za16_bf16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z29test_svreadz_ver_za16_bf16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svreadz_ver_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -576,28 +324,14 @@ svbfloat16x2_t test_svreadz_ver_za16_bf16_x2(uint32_t slice) __arm_streaming __a // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za32_s32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( 
poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za32_s32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svreadz_ver_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -607,28 +341,14 @@ svint32x2_t test_svreadz_ver_za32_s32_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za32_u32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za32_u32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svreadz_ver_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -638,28 +358,14 @@ svuint32x2_t test_svreadz_ver_za32_u32_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za32_f32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za32_f32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svreadz_ver_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -669,28 +375,14 @@ svfloat32x2_t test_svreadz_ver_za32_f32_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za64_s64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za64_s64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // 
svint64x2_t test_svreadz_ver_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -700,28 +392,14 @@ svint64x2_t test_svreadz_ver_za64_s64_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za64_u64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za64_u64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svreadz_ver_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -731,28 +409,14 @@ svuint64x2_t test_svreadz_ver_za64_u64_x2(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , } @test_svreadz_ver_za64_f64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z28test_svreadz_ver_za64_f64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svreadz_ver_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -765,36 +429,14 @@ svfloat64x2_t test_svreadz_ver_za64_f64_x2(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za8_s8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z26test_svreadz_hor_za8_s8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svreadz_hor_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -804,36 +446,14 @@ svint8x4_t test_svreadz_hor_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za8_u8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z26test_svreadz_hor_za8_u8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svreadz_hor_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -843,36 +463,14 @@ svuint8x4_t test_svreadz_hor_za8_u8_x4(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za16_s16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// 
CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za16_s16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svreadz_hor_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -882,36 +480,14 @@ svint16x4_t test_svreadz_hor_za16_s16_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za16_u16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za16_u16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svreadz_hor_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -921,36 +497,14 @@ svuint16x4_t test_svreadz_hor_za16_u16_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za16_f16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za16_f16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svreadz_hor_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -960,36 +514,14 @@ svfloat16x4_t test_svreadz_hor_za16_f16_x4(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za16_bf16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z29test_svreadz_hor_za16_bf16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svreadz_hor_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1000,36 +532,14 @@ svbfloat16x4_t test_svreadz_hor_za16_bf16_x4(uint32_t slice) __arm_streaming __a // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za32_s32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za32_s32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svreadz_hor_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1039,36 +549,14 @@ svint32x4_t test_svreadz_hor_za32_s32_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za32_u32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za32_u32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svreadz_hor_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1078,36 +566,14 @@ svuint32x4_t test_svreadz_hor_za32_u32_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za32_f32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za32_f32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svreadz_hor_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1117,36 
+583,14 @@ svfloat32x4_t test_svreadz_hor_za32_f32_x4(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za64_s64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za64_s64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svreadz_hor_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1156,36 +600,14 @@ svint64x4_t test_svreadz_hor_za64_s64_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za64_u64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za64_u64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svreadz_hor_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1195,36 +617,14 @@ svuint64x4_t test_svreadz_hor_za64_u64_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_hor_za64_f64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_hor_za64_f64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue 
{ , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svreadz_hor_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1236,36 +636,14 @@ svfloat64x4_t test_svreadz_hor_za64_f64_x4(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za8_s8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z26test_svreadz_ver_za8_s8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// 
CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svreadz_ver_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1275,36 +653,14 @@ svint8x4_t test_svreadz_ver_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za8_u8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z26test_svreadz_ver_za8_u8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svreadz_ver_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1314,36 +670,14 @@ svuint8x4_t test_svreadz_ver_za8_u8_x4(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za16_s16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za16_s16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svreadz_ver_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1353,36 +687,14 @@ svint16x4_t test_svreadz_ver_za16_s16_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za16_u16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za16_u16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , 
}, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svreadz_ver_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1392,36 +704,14 @@ svuint16x4_t test_svreadz_ver_za16_u16_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za16_f16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za16_f16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], 
i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svreadz_ver_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1431,36 +721,14 @@ svfloat16x4_t test_svreadz_ver_za16_f16_x4(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za16_bf16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z29test_svreadz_ver_za16_bf16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svreadz_ver_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1471,36 +739,14 @@ svbfloat16x4_t test_svreadz_ver_za16_bf16_x4(uint32_t slice) __arm_streaming __a // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za32_s32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } 
[[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za32_s32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svreadz_ver_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1510,36 +756,14 @@ svint32x4_t test_svreadz_ver_za32_s32_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za32_u32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // 
// CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za32_u32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svreadz_ver_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1549,36 +773,14 @@ svuint32x4_t test_svreadz_ver_za32_u32_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za32_f32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za32_f32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svreadz_ver_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1588,36 +790,14 @@ svfloat32x4_t test_svreadz_ver_za32_f32_x4(uint32_t slice) __arm_streaming __arm // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za64_s64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za64_s64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svreadz_ver_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1627,36 +807,14 @@ svint64x4_t test_svreadz_ver_za64_s64_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za64_u64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = 
alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z28test_svreadz_ver_za64_u64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svreadz_ver_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -1666,36 +824,14 @@ svuint64x4_t test_svreadz_ver_za64_u64_x4(uint32_t slice) __arm_streaming __arm_ // CHECK-LABEL: define dso_local { , , , } @test_svreadz_ver_za64_f64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], 
ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @_Z28test_svreadz_ver_za64_f64_x4j(
 // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
-// CPP-CHECK-NEXT:    store <vscale x 8 x double> [[TMP8]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = load { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP9]]
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]]
 //
 svfloat64x4_t test_svreadz_ver_za64_f64_x4(uint32_t slice) __arm_streaming __arm
@@ -2120,28 +1256,14 @@ svfloat64_t test_svreadz_hor_za128_f64(uint32_t slice) __arm_streaming __arm_ino
 // CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreadz_za8_s8_x2(
 // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z22test_svreadz_za8_s8_x2j(
 // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT:    store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svreadz_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("z
@@ -2151,28 +1273,14 @@ svint8x2_t test_svreadz_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("z
__arm_inout("z // CHECK-LABEL: define dso_local { , } @test_svreadz_za8_u8_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svreadz_za8_u8_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint8x2_t test_svreadz_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2182,28 +1290,14 @@ svuint8x2_t test_svreadz_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout(" // CHECK-LABEL: define dso_local { , } @test_svreadz_za16_s16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za16_s16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// 
CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint16x2_t test_svreadz_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2213,28 +1307,14 @@ svint16x2_t test_svreadz_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , } @test_svreadz_za16_u16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za16_u16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint16x2_t test_svreadz_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2244,28 +1324,14 @@ svuint16x2_t test_svreadz_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , } @test_svreadz_za32_s32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za32_s32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } 
[[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint32x2_t test_svreadz_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2275,28 +1341,14 @@ svint32x2_t test_svreadz_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , } @test_svreadz_za32_u32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za32_u32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint32x2_t test_svreadz_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2306,28 +1358,14 @@ svuint32x2_t test_svreadz_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , } @test_svreadz_za64_s64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za64_s64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svint64x2_t test_svreadz_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2337,28 +1375,14 @@ svint64x2_t test_svreadz_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , } @test_svreadz_za64_u64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za64_u64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svuint64x2_t test_svreadz_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2368,28 +1392,14 @@ svuint64x2_t test_svreadz_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , } @test_svreadz_za16_bf16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // 
CPP-CHECK-LABEL: define dso_local { , } @_Z25test_svreadz_za16_bf16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svbfloat16x2_t test_svreadz_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2399,28 +1409,14 @@ svbfloat16x2_t test_svreadz_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , } @test_svreadz_za16_f16_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za16_f16_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat16x2_t test_svreadz_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2430,28 +1426,14 @@ svfloat16x2_t test_svreadz_za16_f16_x2(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , } @test_svreadz_za32_f32_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za32_f32_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat32x2_t test_svreadz_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2461,28 +1443,14 @@ svfloat32x2_t test_svreadz_za32_f32_x2(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , } @test_svreadz_za64_f64_x2( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z24test_svreadz_za64_f64_x2j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svfloat64x2_t test_svreadz_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2496,36 +1464,14 @@ svfloat64x2_t test_svreadz_za64_f64_x2(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za8_s8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) -// 
CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z22test_svreadz_za8_s8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint8x4_t test_svreadz_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2535,36 +1481,14 @@ svint8x4_t test_svreadz_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("z // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za8_u8_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // 
// CPP-CHECK-LABEL: define dso_local { , , , } @_Z22test_svreadz_za8_u8_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint8x4_t test_svreadz_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2574,36 +1498,14 @@ svuint8x4_t test_svreadz_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout(" // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za16_s16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za16_s16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// 
CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint16x4_t test_svreadz_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2613,36 +1515,14 @@ svint16x4_t test_svreadz_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za16_u16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za16_u16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint16x4_t test_svreadz_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2652,36 +1532,14 @@ svuint16x4_t test_svreadz_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za32_s32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za32_s32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint32x4_t test_svreadz_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2691,36 +1549,14 @@ svint32x4_t test_svreadz_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za32_u32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// 
CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za32_u32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint32x4_t test_svreadz_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2730,36 +1566,14 @@ svuint32x4_t test_svreadz_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za64_s64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za64_s64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svint64x4_t test_svreadz_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2769,36 +1583,14 @@ svint64x4_t test_svreadz_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za64_u64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za64_u64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svuint64x4_t test_svreadz_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2808,36 +1600,14 @@ svuint64x4_t test_svreadz_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inou // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za16_bf16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z25test_svreadz_za16_bf16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svbfloat16x4_t test_svreadz_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2847,36 +1617,14 @@ svbfloat16x4_t test_svreadz_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_i // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za16_f16_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: 
[[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za16_f16_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat16x4_t test_svreadz_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2886,36 +1634,14 @@ svfloat16x4_t test_svreadz_za16_f16_x4(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za32_f32_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za32_f32_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { 
, , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat32x4_t test_svreadz_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") { @@ -2925,36 +1651,14 @@ svfloat32x4_t test_svreadz_za32_f32_x4(uint32_t slice) __arm_streaming __arm_ino // CHECK-LABEL: define dso_local { , , , } @test_svreadz_za64_f64_x4( // CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret { , , , } [[TMP9]] +// CHECK-NEXT: ret { , , , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , , , } @_Z24test_svreadz_za64_f64_x4j( // CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , , , }, align 16 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) -// CPP-CHECK-NEXT: store [[TMP8]], ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: [[TMP9:%.*]] = load { , , , }, ptr [[RETVAL]], align 16 -// CPP-CHECK-NEXT: ret { , , , } [[TMP9]] +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] // svfloat64x4_t test_svreadz_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") { diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c index 6cea34ee52ef6..deb126236ad57 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c +++ 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c
@@ -141,27 +141,13 @@ svbool_t test_svpext_lane_c64_3(svcount_t c) ATTR {
// CHECK-LABEL: @test_svpext_lane_c8_x2_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i1>, <vscale x 16 x i1> }, align 8
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0)
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i1> [[TMP4]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i1>, <vscale x 16 x i1> }, ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z24test_svpext_lane_c8_x2_0u11__SVCount_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i1>, <vscale x 16 x i1> }, align 8
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0)
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i1> [[TMP4]], ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i1>, <vscale x 16 x i1> }, ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]]
//
svboolx2_t test_svpext_lane_c8_x2_0(svcount_t c) ATTR {
return svpext_lane_c8_x2(c, 0);
@@ -169,27 +155,13 @@ svboolx2_t test_svpext_lane_c8_x2_0(svcount_t c) ATTR {
// CHECK-LABEL: @test_svpext_lane_c8_x2_1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i1>, <vscale x 16 x i1> }, align 8
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1)
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i1> [[TMP4]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i1>, <vscale x 16 x i1> }, ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z24test_svpext_lane_c8_x2_1u11__SVCount_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i1>, <vscale x 16 x i1> }, align 8
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1)
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i1> [[TMP4]], ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i1>, <vscale x 16 x i1> }, ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i1>, <vscale x 16 x i1> } [[TMP0]]
//
svboolx2_t 
test_svpext_lane_c8_x2_1(svcount_t c) ATTR { return svpext_lane_c8_x2(c, 1); @@ -197,31 +169,25 @@ svboolx2_t test_svpext_lane_c8_x2_1(svcount_t c) ATTR { // CHECK-LABEL: @test_svpext_lane_c16_x2_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_0u11__SVCount_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svpext_lane_c16_x2_0(svcount_t c) ATTR { return svpext_lane_c16_x2(c, 0); @@ -229,31 +195,25 @@ svboolx2_t test_svpext_lane_c16_x2_0(svcount_t c) ATTR { // CHECK-LABEL: @test_svpext_lane_c16_x2_1( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], 
align 8 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_1u11__SVCount_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svpext_lane_c16_x2_1(svcount_t c) ATTR { return svpext_lane_c16_x2(c, 1); @@ -261,31 +221,25 @@ svboolx2_t test_svpext_lane_c16_x2_1(svcount_t c) ATTR { // CHECK-LABEL: @test_svpext_lane_c32_x2_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_0u11__SVCount_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svpext_lane_c32_x2_0(svcount_t c) ATTR { return svpext_lane_c32_x2(c, 0); @@ -293,31 +247,25 @@ svboolx2_t test_svpext_lane_c32_x2_0(svcount_t c) ATTR { // CHECK-LABEL: @test_svpext_lane_c32_x2_1( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_1u11__SVCount_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svpext_lane_c32_x2_1(svcount_t c) ATTR { return svpext_lane_c32_x2(c, 1); @@ -325,31 +273,25 @@ svboolx2_t test_svpext_lane_c32_x2_1(svcount_t c) ATTR { // CHECK-LABEL: @test_svpext_lane_c64_x2_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call 
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_0u11__SVCount_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svpext_lane_c64_x2_0(svcount_t c) ATTR {
return svpext_lane_c64_x2(c, 0);
@@ -357,31 +299,25 @@ svboolx2_t test_svpext_lane_c64_x2_0(svcount_t c) ATTR {
// CHECK-LABEL: @test_svpext_lane_c64_x2_1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_1u11__SVCount_t(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 8
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 8
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svpext_lane_c64_x2_1(svcount_t c) ATTR {
return svpext_lane_c64_x2(c, 1);
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
index 3fcc1dc6c819a..612f2d25d40d0 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
@@ -23,28 +23,14 @@
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b8_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilege_b8_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilege_b8_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b8,_s64,_x2)(op1, op2);
@@ -53,28 +39,14 @@ svboolx2_t test_svwhilege_b8_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b8_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilege_b8_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilege_b8_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b8,_u64,_x2)(op1, op2);
@@ -83,32 +55,26 @@ svboolx2_t test_svwhilege_b8_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b16_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b16_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b16_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b16,_s64,_x2)(op1, op2);
@@ -117,32 +83,26 @@ svboolx2_t test_svwhilege_b16_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b16_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b16_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b16_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b16,_u64,_x2)(op1, op2);
@@ -151,32 +111,26 @@ svboolx2_t test_svwhilege_b16_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b32_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b32_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b32_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b32,_s64,_x2)(op1, op2);
@@ -185,32 +139,26 @@ svboolx2_t test_svwhilege_b32_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b32_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b32_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b32_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b32,_u64,_x2)(op1, op2);
@@ -219,32 +167,26 @@ svboolx2_t test_svwhilege_b32_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b64_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b64_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b64_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b64,_s64,_x2)(op1, op2);
@@ -253,32 +195,26 @@ svboolx2_t test_svwhilege_b64_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilege_b64_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilege_b64_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilege_b64_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilege_b64,_u64,_x2)(op1, op2);
@@ -287,28 +223,14 @@ svboolx2_t test_svwhilege_b64_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b8_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilegt_b8_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilegt_b8_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b8,_s64,_x2)(op1, op2);
@@ -317,28 +239,14 @@ svboolx2_t test_svwhilegt_b8_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b8_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilegt_b8_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilegt_b8_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b8,_u64,_x2)(op1, op2);
@@ -347,32 +255,26 @@ svboolx2_t test_svwhilegt_b8_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b16_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b16_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b16_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b16,_s64,_x2)(op1, op2);
@@ -381,32 +283,26 @@ svboolx2_t test_svwhilegt_b16_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b16_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b16_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b16_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b16,_u64,_x2)(op1, op2);
@@ -415,32 +311,26 @@ svboolx2_t test_svwhilegt_b16_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b32_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b32_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b32_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b32,_s64,_x2)(op1, op2);
@@ -449,32 +339,26 @@ svboolx2_t test_svwhilegt_b32_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b32_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b32_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b32_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b32,_u64,_x2)(op1, op2);
@@ -483,32 +367,26 @@ svboolx2_t test_svwhilegt_b32_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b64_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b64_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b64_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b64,_s64,_x2)(op1, op2);
@@ -517,32 +395,26 @@ svboolx2_t test_svwhilegt_b64_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilegt_b64_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilegt_b64_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilegt_b64_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilegt_b64,_u64,_x2)(op1, op2);
@@ -551,28 +423,14 @@ svboolx2_t test_svwhilegt_b64_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b8_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilele_b8_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilele_b8_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b8,_s64,_x2)(op1, op2);
@@ -581,28 +439,14 @@ svboolx2_t test_svwhilele_b8_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b8_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP5]]
+// CHECK-NEXT: ret { , } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilele_b8_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP5]]
+// CPP-CHECK-NEXT: ret { , } [[TMP0]]
//
svboolx2_t test_svwhilele_b8_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b8,_u64,_x2)(op1, op2);
@@ -611,32 +455,26 @@ svboolx2_t test_svwhilele_b8_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b16_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b16_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilele_b16_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b16,_s64,_x2)(op1, op2);
@@ -645,32 +483,26 @@ svboolx2_t test_svwhilele_b16_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b16_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b16_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilele_b16_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b16,_u64,_x2)(op1, op2);
@@ -679,32 +511,26 @@ svboolx2_t test_svwhilele_b16_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b32_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b32_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilele_b32_s64(int64_t op1, int64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b32,_s64,_x2)(op1, op2);
@@ -713,32 +539,26 @@ svboolx2_t test_svwhilele_b32_s64(int64_t op1, int64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b32_u64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b32_u64mm(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]])
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]])
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]])
-// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CPP-CHECK-NEXT: ret { , } [[TMP7]]
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CPP-CHECK-NEXT: ret { , } [[TMP6]]
//
svboolx2_t test_svwhilele_b32_u64(uint64_t op1, uint64_t op2) ATTR {
return SVE_ACLE_FUNC(svwhilele_b32,_u64,_x2)(op1, op2);
@@ -747,32 +567,26 @@ svboolx2_t test_svwhilele_b32_u64(uint64_t op1, uint64_t op2) ATTR {
// CHECK-LABEL: define dso_local { , } @test_svwhilele_b64_s64(
// CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]])
-// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
-// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2
-// CHECK-NEXT: ret { , } [[TMP7]]
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1
+// CHECK-NEXT: ret { , } [[TMP6]]
//
// CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b64_s64ll(
// CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]])
, } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilele_b64_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b64,_s64,_x2)(op1, op2); @@ -781,32 +595,26 @@ svboolx2_t test_svwhilele_b64_s64(int64_t op1, int64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilele_b64_u64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilele_b64_u64mm( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } 
[[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilele_b64_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b64,_u64,_x2)(op1, op2); @@ -815,28 +623,14 @@ svboolx2_t test_svwhilele_b64_u64(uint64_t op1, uint64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b8_s64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilelt_b8_s64ll( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svboolx2_t test_svwhilelt_b8_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b8,_s64,_x2)(op1, op2); @@ -845,28 +639,14 @@ svboolx2_t test_svwhilelt_b8_s64(int64_t op1, int64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b8_u64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) -// CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP5]] +// CHECK-NEXT: ret { , } [[TMP0]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z21test_svwhilelt_b8_u64mm( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1]], i64 [[OP2]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 -// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) -// CPP-CHECK-NEXT: store [[TMP4]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP5]] +// CPP-CHECK-NEXT: ret { , } [[TMP0]] // svboolx2_t test_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b8,_u64,_x2)(op1, op2); @@ -875,32 +655,26 @@ svboolx2_t test_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b16_s64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b16_s64ll( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b16_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b16,_s64,_x2)(op1, op2); @@ -909,32 +683,26 @@ svboolx2_t test_svwhilelt_b16_s64(int64_t op1, int64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b16_u64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 
2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b16_u64mm( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b16,_u64,_x2)(op1, op2); @@ -943,32 +711,26 @@ svboolx2_t test_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b32_s64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { 
, } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b32_s64ll( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b32_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b32,_s64,_x2)(op1, op2); @@ -977,32 +739,26 @@ svboolx2_t test_svwhilelt_b32_s64(int64_t op1, int64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b32_u64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b32_u64mm( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b32,_u64,_x2)(op1, op2); @@ -1011,32 +767,26 @@ svboolx2_t test_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b64_s64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b64_s64ll( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b64_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b64,_s64,_x2)(op1, op2); @@ -1045,32 +795,26 @@ svboolx2_t test_svwhilelt_b64_s64(int64_t op1, int64_t op2) ATTR { // CHECK-LABEL: define dso_local { , } @test_svwhilelt_b64_u64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CHECK-NEXT: ret { , } [[TMP7]] +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CHECK-NEXT: ret { , } [[TMP6]] // // CPP-CHECK-LABEL: define dso_local { , } @_Z22test_svwhilelt_b64_u64mm( // CPP-CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0]] { // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { , }, align 2 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1]], i64 [[OP2]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: store [[TMP6]], ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = load { , }, ptr [[RETVAL]], align 2 -// CPP-CHECK-NEXT: ret { , } [[TMP7]] +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , } [[TMP3]], [[TMP5]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP6]] // svboolx2_t test_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b64,_u64,_x2)(op1, op2); diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 5dea8e9fe7903..228435a0494c3 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -24,7 +24,7 @@ int foo() { return fmv()+fmv_one()+fmv_two(); } -inline int __attribute__((target_version("sha1+pmull+f64mm"))) fmv_inline(void) { return 1; } +inline int __attribute__((target_version("sha2+pmull+f64mm"))) fmv_inline(void) { return 1; } inline int __attribute__((target_version("fp16+fcma+rdma+sme+ fp16 "))) fmv_inline(void) { return 2; } inline int __attribute__((target_version("sha3+i8mm+f32mm"))) fmv_inline(void) { return 12; } inline int __attribute__((target_version("dit+sve-ebf16"))) fmv_inline(void) { return 8; } @@ -659,7 +659,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1 +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha2 // CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -805,12 +805,12 @@ int caller(void) { return 
used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 34359773184 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 34359773184 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 34359775232 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 34359775232 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv_inline._Mf64mmMpmullMsha1 +// CHECK-NEXT: ret ptr @fmv_inline._Mf64mmMpmullMsha2 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17246986240 @@ -1135,7 +1135,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon" } // CHECK: attributes #[[ATTR21:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" } // CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sha2,+sve" } // CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme" } // CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve" } // CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve" } diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index 7e094a52653ef..0e53d3e141b01 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -607,6 +607,26 @@ void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, si = __builtin_elementwise_popcount(si); } +void test_builtin_elementwise_fmod(float f1, float f2, double d1, double d2, + float4 vf1, float4 vf2) { + + // CHECK-LABEL: define void @test_builtin_elementwise_fmod( + // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4 + // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4 + // CHECK-NEXT: frem float [[F1]], [[F2]] + f2 = __builtin_elementwise_fmod(f1, f2); + + // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8 + // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8 + // CHECK-NEXT: frem double [[D1]], [[D2]] + d2 = __builtin_elementwise_fmod(d1, d2); + + // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16 + // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, 
align 16 + // CHECK-NEXT: frem <4 x float> [[VF1]], [[VF2]] + vf2 = __builtin_elementwise_fmod(vf1, vf2); +} + void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2, float4 vf1, float4 vf2) { diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index bfa72e8bd6945..0d0e3ecdb90c9 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -114,6 +114,7 @@ __device__ int read_ids() { // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid() // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid() // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid() +// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize() int a = __nvvm_read_ptx_sreg_laneid(); int b = __nvvm_read_ptx_sreg_warpid(); @@ -121,8 +122,9 @@ __device__ int read_ids() { int d = __nvvm_read_ptx_sreg_smid(); int e = __nvvm_read_ptx_sreg_nsmid(); int f = __nvvm_read_ptx_sreg_gridid(); + int g = __nvvm_read_ptx_sreg_warpsize(); - return a + b + c + d + e + f; + return a + b + c + d + e + f + g; } diff --git a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp index 55ba17a195580..651f5bfc94c6c 100644 --- a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp +++ b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp @@ -306,3 +306,14 @@ float4 strict_elementwise_fma(float4 a, float4 b, float4 c) { float4 strict_elementwise_pow(float4 a, float4 b) { return __builtin_elementwise_pow(a, b); } + +// CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_fmodDv4_fS_ +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.frem.v4f32(<4 x float> [[A]], <4 x float> [[B]], +// CHECK-SAME: metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float4 strict_elementwise_fmod(float4 a, float4 b) { + return __builtin_elementwise_fmod(a, b); +} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 41cbd5a0219d5..8548aa00cfe87 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -28,11 +28,11 @@ // RUN: %clang_cc1 -triple sparc-sun-solaris -emit-llvm -o - %s | \ // RUN: FileCheck %s --check-prefix=SPARC-V8 -// SPARC-V8: target datalayout = "E-m:e-p:32:32-i64:64-f128:64-n32-S64" +// SPARC-V8: target datalayout = "E-m:e-p:32:32-i64:64-i128:128-f128:64-n32-S64" // RUN: %clang_cc1 -triple sparcv9-sun-solaris -emit-llvm -o - %s | \ // RUN: FileCheck %s --check-prefix=SPARC-V9 -// SPARC-V9: target datalayout = "E-m:e-i64:64-n32:64-S128" +// SPARC-V9: target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" // RUN: %clang_cc1 -triple mipsel-linux-gnu -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=MIPS-32EL diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c index 75d8c3d501750..8860b7042d0a2 100644 --- a/clang/test/CodeGen/tbaa-pointers.c +++ b/clang/test/CodeGen/tbaa-pointers.c @@ -116,17 +116,43 @@ void p2struct(struct S1 **ptr) { // COMMON-LABEL: define void @p2struct( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // ENABLED-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0:!.+]] - // ENABLED-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0]] - // ENABLED-NEXT: store ptr null, ptr [[BASE]], align 
8, !tbaa [[P1S1_:!.+]] - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // ENABLED-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: ret void + // + *ptr = 0; +} + +void p2struct_const(struct S1 const **ptr) { + // COMMON-LABEL: define void @p2struct_const( + // COMMON-SAME: ptr noundef [[PTR:%.+]]) + // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 + // COMMON-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] // COMMON-NEXT: ret void // *ptr = 0; } +struct S2 { + struct S1 *s; +}; + +void p2struct2(struct S2 *ptr) { + // COMMON-LABEL: define void @p2struct2( + // COMMON-SAME: ptr noundef [[PTR:%.+]]) + // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 + // COMMON-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[S:%.+]] = getelementptr inbounds nuw %struct.S2, ptr [[BASE]], i32 0, i32 0 + // COMMON-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[S2_S_TAG:!.+]] + // COMMON-NEXT: ret void + ptr->s = 0; +} + // ENABLED: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0} // ENABLED: [[P2INT]] = !{!"p2 int", [[ANY_POINTER:!.+]], i64 0} // DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0} @@ -145,3 +171,5 @@ void p2struct(struct S1 **ptr) { // ENABLED: [[P2CHAR]] = !{!"p2 omnipotent char", [[ANY_POINTER]], i64 0} // ENABLED: [[P1CHAR_0]] = !{[[P1CHAR:!.+]], [[P1CHAR]], i64 0} // ENABLED: [[P1CHAR]] = !{!"p1 omnipotent char", [[ANY_POINTER]], i64 0} +// COMMON: [[S2_S_TAG]] = !{[[S2_TY:!.+]], [[ANY_POINTER]], i64 0} +// COMMON: [[S2_TY]] = !{!"S2", [[ANY_POINTER]], i64 0} diff --git a/clang/test/CodeGen/tbaa-reference.cpp b/clang/test/CodeGen/tbaa-reference.cpp index c4d9e70a8b07f..d22cd90b43ae9 100644 --- a/clang/test/CodeGen/tbaa-reference.cpp +++ b/clang/test/CodeGen/tbaa-reference.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH +// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes -pointer-tbaa %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH // RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH +// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -pointer-tbaa -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH // // Check that we generate correct TBAA information for reference accesses. 
diff --git a/clang/test/CodeGenCUDA/address-spaces.cu b/clang/test/CodeGenCUDA/address-spaces.cu index 0608c9cabd048..66903c81b9333 100644 --- a/clang/test/CodeGenCUDA/address-spaces.cu +++ b/clang/test/CodeGenCUDA/address-spaces.cu @@ -9,7 +9,7 @@ // CHECK: @i ={{.*}} addrspace(1) externally_initialized global __device__ int i; -// CHECK: @j ={{.*}} addrspace(4) externally_initialized global +// CHECK: @j ={{.*}} addrspace(4) externally_initialized constant __constant__ int j; // CHECK: @k ={{.*}} addrspace(3) global diff --git a/clang/test/CodeGenCUDA/amdgpu-visibility.cu b/clang/test/CodeGenCUDA/amdgpu-visibility.cu index d7dbab112a68c..ef74d932ee8c8 100644 --- a/clang/test/CodeGenCUDA/amdgpu-visibility.cu +++ b/clang/test/CodeGenCUDA/amdgpu-visibility.cu @@ -4,11 +4,11 @@ #include "Inputs/cuda.h" -// CHECK-DEFAULT: @c ={{.*}} addrspace(4) externally_initialized global +// CHECK-DEFAULT: @c ={{.*}} addrspace(4) externally_initialized constant // CHECK-DEFAULT: @g ={{.*}} addrspace(1) externally_initialized global -// CHECK-PROTECTED: @c = protected addrspace(4) externally_initialized global +// CHECK-PROTECTED: @c = protected addrspace(4) externally_initialized constant // CHECK-PROTECTED: @g = protected addrspace(1) externally_initialized global -// CHECK-HIDDEN: @c = protected addrspace(4) externally_initialized global +// CHECK-HIDDEN: @c = protected addrspace(4) externally_initialized constant // CHECK-HIDDEN: @g = protected addrspace(1) externally_initialized global __constant__ int c; __device__ int g; diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu index 3c55e9907dd6c..d931f31d0207c 100644 --- a/clang/test/CodeGenCUDA/anon-ns.cu +++ b/clang/test/CodeGenCUDA/anon-ns.cu @@ -28,13 +28,13 @@ // HIP-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT_\.intern\.b04fd23c98500190]]( // HIP-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT_\.intern\.b04fd23c98500190]]( // HIP-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE\.static\.b04fd23c98500190]] = addrspace(1) externally_initialized global -// HIP-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE\.static\.b04fd23c98500190]] = addrspace(4) externally_initialized global +// HIP-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE\.static\.b04fd23c98500190]] = addrspace(4) externally_initialized constant // HIP-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE\.static\.b04fd23c98500190]] = addrspace(1) externally_initialized global // CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]]( -// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized global +// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant // CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global // COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global diff --git a/clang/test/CodeGenCUDA/device-var-init.cu b/clang/test/CodeGenCUDA/device-var-init.cu index 226b7e295f4b4..9d62e4126b430 100644 --- a/clang/test/CodeGenCUDA/device-var-init.cu +++ b/clang/test/CodeGenCUDA/device-var-init.cu @@ -26,7 +26,7 @@ __shared__ int s_v; // DEVICE: @s_v ={{.*}} addrspace(3) global i32 undef, // HOST: @s_v = internal global i32 undef, __constant__ 
int c_v; -// DEVICE: addrspace(4) externally_initialized global i32 0, +// DEVICE: addrspace(4) externally_initialized constant i32 0, // HOST: @c_v = internal global i32 undef, __device__ int d_v_i = 1; @@ -51,14 +51,14 @@ __shared__ T s_t; // DEVICE: @s_t ={{.*}} addrspace(3) global %struct.T undef, // HOST: @s_t = internal global %struct.T undef, __constant__ T c_t; -// DEVICE: @c_t ={{.*}} addrspace(4) externally_initialized global %struct.T zeroinitializer, +// DEVICE: @c_t ={{.*}} addrspace(4) externally_initialized constant %struct.T zeroinitializer, // HOST: @c_t = internal global %struct.T undef, __device__ T d_t_i = {2}; // DEVICE: @d_t_i ={{.*}} addrspace(1) externally_initialized global %struct.T { i32 2 }, // HOST: @d_t_i = internal global %struct.T undef, __constant__ T c_t_i = {2}; -// DEVICE: @c_t_i ={{.*}} addrspace(4) externally_initialized global %struct.T { i32 2 }, +// DEVICE: @c_t_i ={{.*}} addrspace(4) externally_initialized constant %struct.T { i32 2 }, // HOST: @c_t_i = internal global %struct.T undef, // empty constructor @@ -69,7 +69,7 @@ __shared__ EC s_ec; // DEVICE: @s_ec ={{.*}} addrspace(3) global %struct.EC undef, // HOST: @s_ec = internal global %struct.EC undef, __constant__ EC c_ec; -// DEVICE: @c_ec ={{.*}} addrspace(4) externally_initialized global %struct.EC zeroinitializer, +// DEVICE: @c_ec ={{.*}} addrspace(4) externally_initialized constant %struct.EC zeroinitializer, // HOST: @c_ec = internal global %struct.EC undef // empty destructor @@ -80,7 +80,7 @@ __shared__ ED s_ed; // DEVICE: @s_ed ={{.*}} addrspace(3) global %struct.ED undef, // HOST: @s_ed = internal global %struct.ED undef, __constant__ ED c_ed; -// DEVICE: @c_ed ={{.*}} addrspace(4) externally_initialized global %struct.ED zeroinitializer, +// DEVICE: @c_ed ={{.*}} addrspace(4) externally_initialized constant %struct.ED zeroinitializer, // HOST: @c_ed = internal global %struct.ED undef, __device__ ECD d_ecd; @@ -90,7 +90,7 @@ __shared__ ECD s_ecd; // DEVICE: @s_ecd ={{.*}} addrspace(3) global %struct.ECD undef, // HOST: @s_ecd = internal global %struct.ECD undef, __constant__ ECD c_ecd; -// DEVICE: @c_ecd ={{.*}} addrspace(4) externally_initialized global %struct.ECD zeroinitializer, +// DEVICE: @c_ecd ={{.*}} addrspace(4) externally_initialized constant %struct.ECD zeroinitializer, // HOST: @c_ecd = internal global %struct.ECD undef, // empty templated constructor -- allowed with no arguments @@ -101,14 +101,14 @@ __shared__ ETC s_etc; // DEVICE: @s_etc ={{.*}} addrspace(3) global %struct.ETC undef, // HOST: @s_etc = internal global %struct.ETC undef, __constant__ ETC c_etc; -// DEVICE: @c_etc ={{.*}} addrspace(4) externally_initialized global %struct.ETC zeroinitializer, +// DEVICE: @c_etc ={{.*}} addrspace(4) externally_initialized constant %struct.ETC zeroinitializer, // HOST: @c_etc = internal global %struct.ETC undef, __device__ NCFS d_ncfs; // DEVICE: @d_ncfs ={{.*}} addrspace(1) externally_initialized global %struct.NCFS { i32 3 } // HOST: @d_ncfs = internal global %struct.NCFS undef, __constant__ NCFS c_ncfs; -// DEVICE: @c_ncfs ={{.*}} addrspace(4) externally_initialized global %struct.NCFS { i32 3 } +// DEVICE: @c_ncfs ={{.*}} addrspace(4) externally_initialized constant %struct.NCFS { i32 3 } // HOST: @c_ncfs = internal global %struct.NCFS undef, // Regular base class -- allowed @@ -119,7 +119,7 @@ __shared__ T_B_T s_t_b_t; // DEVICE: @s_t_b_t ={{.*}} addrspace(3) global %struct.T_B_T undef, // HOST: @s_t_b_t = internal global %struct.T_B_T undef, __constant__ T_B_T 
c_t_b_t; -// DEVICE: @c_t_b_t ={{.*}} addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer, +// DEVICE: @c_t_b_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_B_T zeroinitializer, // HOST: @c_t_b_t = internal global %struct.T_B_T undef, // Incapsulated object of allowed class -- allowed @@ -130,7 +130,7 @@ __shared__ T_F_T s_t_f_t; // DEVICE: @s_t_f_t ={{.*}} addrspace(3) global %struct.T_F_T undef, // HOST: @s_t_f_t = internal global %struct.T_F_T undef, __constant__ T_F_T c_t_f_t; -// DEVICE: @c_t_f_t ={{.*}} addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer, +// DEVICE: @c_t_f_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_F_T zeroinitializer, // HOST: @c_t_f_t = internal global %struct.T_F_T undef, // array of allowed objects -- allowed @@ -141,7 +141,7 @@ __shared__ T_FA_T s_t_fa_t; // DEVICE: @s_t_fa_t ={{.*}} addrspace(3) global %struct.T_FA_T undef, // HOST: @s_t_fa_t = internal global %struct.T_FA_T undef, __constant__ T_FA_T c_t_fa_t; -// DEVICE: @c_t_fa_t ={{.*}} addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer, +// DEVICE: @c_t_fa_t ={{.*}} addrspace(4) externally_initialized constant %struct.T_FA_T zeroinitializer, // HOST: @c_t_fa_t = internal global %struct.T_FA_T undef, @@ -153,7 +153,7 @@ __shared__ EC_I_EC s_ec_i_ec; // DEVICE: @s_ec_i_ec ={{.*}} addrspace(3) global %struct.EC_I_EC undef, // HOST: @s_ec_i_ec = internal global %struct.EC_I_EC undef, __constant__ EC_I_EC c_ec_i_ec; -// DEVICE: @c_ec_i_ec ={{.*}} addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer, +// DEVICE: @c_ec_i_ec ={{.*}} addrspace(4) externally_initialized constant %struct.EC_I_EC zeroinitializer, // HOST: @c_ec_i_ec = internal global %struct.EC_I_EC undef, // DEVICE: @_ZZ2dfvE4s_ec = internal addrspace(3) global %struct.EC undef diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu index 3c2efb57525c9..4c57323d85f9d 100644 --- a/clang/test/CodeGenCUDA/device-var-linkage.cu +++ b/clang/test/CodeGenCUDA/device-var-linkage.cu @@ -20,7 +20,7 @@ // NORDC-H-DAG: @v1 = internal global i32 undef // RDC-H-DAG: @v1 = global i32 undef __device__ int v1; -// DEV-DAG: @v2 = addrspace(4) externally_initialized global i32 0 +// DEV-DAG: @v2 = addrspace(4) externally_initialized constant i32 0 // NORDC-H-DAG: @v2 = internal global i32 undef // RDC-H-DAG: @v2 = global i32 undef __constant__ int v2; @@ -48,10 +48,10 @@ extern __managed__ int ev3; // HOST-DAG: @_ZL3sv1 = internal global i32 undef // CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 static __device__ int sv1; -// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized global i32 0 -// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0 +// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 // HOST-DAG: @_ZL3sv2 = internal global i32 undef -// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 +// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0 static __constant__ int sv2; // NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null // RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null diff --git a/clang/test/CodeGenCUDA/filter-decl.cu 
b/clang/test/CodeGenCUDA/filter-decl.cu index 0f4691f7c8aa7..02dacd0ad8ef4 100644 --- a/clang/test/CodeGenCUDA/filter-decl.cu +++ b/clang/test/CodeGenCUDA/filter-decl.cu @@ -10,7 +10,7 @@ __asm__("file scope asm is host only"); // CHECK-HOST: constantdata = internal global -// CHECK-DEVICE: constantdata = {{(dso_local )?}}externally_initialized global +// CHECK-DEVICE: constantdata = {{(dso_local )?}}externally_initialized constant __constant__ char constantdata[256]; // CHECK-HOST: devicedata = internal global diff --git a/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu index 80655c2d29604..e92b00345e00c 100644 --- a/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu +++ b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu @@ -50,7 +50,7 @@ static __device__ int x5; } // Check a static constant variable referenced by host is externalized. -// DEV-DAG: @_ZL1y ={{.*}} addrspace(4) externally_initialized global i32 0 +// DEV-DAG: @_ZL1y ={{.*}} addrspace(4) externally_initialized constant i32 0 // HOST-DAG: @_ZL1y = internal global i32 undef // HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" diff --git a/clang/test/CodeGenCUDA/static-device-var-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-rdc.cu index 16ec413397235..9d2811f9385e1 100644 --- a/clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ b/clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -81,11 +81,11 @@ static __device__ int x; static __device__ int x2; // Test normal static device variables -// INT-DEV-DAG: @_ZL1y[[FILEID:.*]] = addrspace(4) externally_initialized global i32 0 +// INT-DEV-DAG: @_ZL1y[[FILEID:.*]] = addrspace(4) externally_initialized constant i32 0 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y[[FILEID:.*]]\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 // EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; diff --git a/clang/test/CodeGenCUDA/template-class-static-member.cu b/clang/test/CodeGenCUDA/template-class-static-member.cu index d790d2dea66ba..b614cd9dcbb14 100644 --- a/clang/test/CodeGenCUDA/template-class-static-member.cu +++ b/clang/test/CodeGenCUDA/template-class-static-member.cu @@ -38,7 +38,7 @@ const int A::const_member; template class A; //DEV-DAG: @_ZN1AIiE8d_memberE = internal addrspace(1) global i32 0, comdat, align 4 -//DEV-DAG: @_ZN1AIiE8c_memberE = internal addrspace(4) global i32 0, comdat, align 4 +//DEV-DAG: @_ZN1AIiE8c_memberE = internal addrspace(4) constant i32 0, comdat, align 4 //DEV-DAG: @_ZN1AIiE8m_memberE = internal addrspace(1) externally_initialized global ptr addrspace(1) null //DEV-DAG: @_ZN1AIiE12const_memberE = internal addrspace(4) constant i32 0, comdat, align 4 //DEV-NEG-NOT: @_ZN1AIiE8h_memberE diff --git a/clang/test/CodeGenCXX/cxx2a-consteval.cpp b/clang/test/CodeGenCXX/cxx2a-consteval.cpp index a58a09554699d..bfeabc946da41 100644 --- a/clang/test/CodeGenCXX/cxx2a-consteval.cpp +++ b/clang/test/CodeGenCXX/cxx2a-consteval.cpp @@ -6,6 +6,14 @@ // RUN: %clang_cc1 -emit-llvm %s -Dconsteval="" -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll // RUN: FileCheck -check-prefix=EXPR -input-file=%t.ll %s +// RUN: %clang_cc1 -emit-llvm %s -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll -fexperimental-new-constant-interpreter +// RUN: FileCheck -check-prefix=EVAL 
-input-file=%t.ll %s +// RUN: FileCheck -check-prefix=EVAL-STATIC -input-file=%t.ll %s +// RUN: FileCheck -check-prefix=EVAL-FN -input-file=%t.ll %s +// +// RUN: %clang_cc1 -emit-llvm %s -Dconsteval="" -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll -fexperimental-new-constant-interpreter +// RUN: FileCheck -check-prefix=EXPR -input-file=%t.ll %s + // there is two version of symbol checks to ensure // that the symbol we are looking for are correct // EVAL-NOT: @__cxx_global_var_init() diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp index 3d5e32516c7af..b967701ca1fa9 100644 --- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp +++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --no-generate-body-for-unused-prefixes --version 4 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s -// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS +// RUN: %clang_cc1 -I%S %s -triple spirv64-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS struct A { virtual void f(); }; struct B : A { }; @@ -15,7 +15,7 @@ B fail; // CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. 
-// WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 +// WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 // WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8 // WITH-NONZERO-DEFAULT-AS: @_ZTI1A = external addrspace(1) constant ptr addrspace(1) // WITH-NONZERO-DEFAULT-AS: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] @@ -60,7 +60,7 @@ B fail; // CHECK-NEXT: ret ptr addrspacecast (ptr addrspace(1) @fail to ptr) // // WITH-NONZERO-DEFAULT-AS-LABEL: define spir_func noundef align 8 dereferenceable(8) ptr addrspace(4) @_Z1fP1A( -// WITH-NONZERO-DEFAULT-AS-SAME: ptr addrspace(4) noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { +// WITH-NONZERO-DEFAULT-AS-SAME: ptr addrspace(4) noundef [[A:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] personality ptr addrspace(4) @__gxx_personality_v0 { // WITH-NONZERO-DEFAULT-AS-NEXT: entry: // WITH-NONZERO-DEFAULT-AS-NEXT: [[RETVAL:%.*]] = alloca ptr addrspace(4), align 8 // WITH-NONZERO-DEFAULT-AS-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(4), align 8 @@ -70,11 +70,11 @@ B fail; // WITH-NONZERO-DEFAULT-AS-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) // WITH-NONZERO-DEFAULT-AS-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // WITH-NONZERO-DEFAULT-AS-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// WITH-NONZERO-DEFAULT-AS-NEXT: [[TMP1:%.*]] = call spir_func ptr addrspace(4) @__dynamic_cast(ptr addrspace(4) [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]] +// WITH-NONZERO-DEFAULT-AS-NEXT: [[TMP1:%.*]] = call spir_func addrspace(4) ptr addrspace(4) @__dynamic_cast(ptr addrspace(4) [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]] // WITH-NONZERO-DEFAULT-AS-NEXT: [[TMP2:%.*]] = icmp eq ptr addrspace(4) [[TMP1]], null // WITH-NONZERO-DEFAULT-AS-NEXT: br i1 [[TMP2]], label [[DYNAMIC_CAST_BAD_CAST:%.*]], label [[DYNAMIC_CAST_END:%.*]] // WITH-NONZERO-DEFAULT-AS: dynamic_cast.bad_cast: -// WITH-NONZERO-DEFAULT-AS-NEXT: invoke spir_func void @__cxa_bad_cast() #[[ATTR4:[0-9]+]] +// WITH-NONZERO-DEFAULT-AS-NEXT: invoke spir_func addrspace(4) void @__cxa_bad_cast() #[[ATTR4:[0-9]+]] // WITH-NONZERO-DEFAULT-AS-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // WITH-NONZERO-DEFAULT-AS: invoke.cont: // WITH-NONZERO-DEFAULT-AS-NEXT: unreachable @@ -90,8 +90,8 @@ B fail; // WITH-NONZERO-DEFAULT-AS-NEXT: br label [[CATCH:%.*]] // WITH-NONZERO-DEFAULT-AS: catch: // WITH-NONZERO-DEFAULT-AS-NEXT: [[EXN:%.*]] = load ptr addrspace(4), ptr [[EXN_SLOT]], align 8 -// WITH-NONZERO-DEFAULT-AS-NEXT: [[TMP6:%.*]] = call spir_func ptr addrspace(4) @__cxa_begin_catch(ptr addrspace(4) [[EXN]]) #[[ATTR3]] -// WITH-NONZERO-DEFAULT-AS-NEXT: call spir_func void @__cxa_end_catch() +// WITH-NONZERO-DEFAULT-AS-NEXT: 
[[TMP6:%.*]] = call spir_func addrspace(4) ptr addrspace(4) @__cxa_begin_catch(ptr addrspace(4) [[EXN]]) #[[ATTR3]] +// WITH-NONZERO-DEFAULT-AS-NEXT: call spir_func addrspace(4) void @__cxa_end_catch() // WITH-NONZERO-DEFAULT-AS-NEXT: br label [[TRY_CONT]] // WITH-NONZERO-DEFAULT-AS: try.cont: // WITH-NONZERO-DEFAULT-AS-NEXT: ret ptr addrspace(4) addrspacecast (ptr addrspace(1) @fail to ptr addrspace(4)) @@ -112,9 +112,9 @@ const B& f(A *a) { // CHECK: attributes #[[ATTR3]] = { nounwind } // CHECK: attributes #[[ATTR4]] = { noreturn } //. -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn } //. @@ -122,6 +122,7 @@ const B& f(A *a) { // CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} //. -// WITH-NONZERO-DEFAULT-AS: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -// WITH-NONZERO-DEFAULT-AS: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +// WITH-NONZERO-DEFAULT-AS: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +// WITH-NONZERO-DEFAULT-AS: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// WITH-NONZERO-DEFAULT-AS: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} //. 
diff --git a/clang/test/CodeGenCXX/mangle-concept.cpp b/clang/test/CodeGenCXX/mangle-concept.cpp
index 91dc1b0e688e0..6053511c00a7b 100644
--- a/clang/test/CodeGenCXX/mangle-concept.cpp
+++ b/clang/test/CodeGenCXX/mangle-concept.cpp
@@ -58,19 +58,19 @@ namespace test2 {
     // CHECK: call {{.*}}@_ZN5test21AIiEF1fEzQ4TrueIT_E(
     // CLANG17: call {{.*}}@_ZN5test21fEz(
     f(ai);
-    // CHECK: call {{.*}}@_ZN5test2F1gIvEEvzQaa4TrueIT_E4TrueITL0__E(
+    // CHECK: call {{.*}}@_ZN5test21AIiEF1gIvEEvzQaa4TrueIT_E4TrueITL0__E(
     // CLANG17: call {{.*}}@_ZN5test21gIvEEvz(
     g(ai);
     // CHECK: call {{.*}}@_ZN5test21hIvEEvzQ4TrueITL0__E(
     // CLANG17: call {{.*}}@_ZN5test21hIvEEvz(
     h(ai);
-    // CHECK: call {{.*}}@_ZN5test2F1iIvQaa4TrueIT_E4TrueITL0__EEEvz(
+    // CHECK: call {{.*}}@_ZN5test21AIiEF1iIvQaa4TrueIT_E4TrueITL0__EEEvz(
     // CLANG17: call {{.*}}@_ZN5test21iIvEEvz(
     i(ai);
     // CHECK: call {{.*}}@_ZN5test21jIvQ4TrueITL0__EEEvz(
     // CLANG17: call {{.*}}@_ZN5test21jIvEEvz(
     j(ai);
-    // CHECK: call {{.*}}@_ZN5test2F1kITk4TruevQ4TrueIT_EEEvz(
+    // CHECK: call {{.*}}@_ZN5test21AIiEF1kITk4TruevQ4TrueIT_EEEvz(
     // CLANG17: call {{.*}}@_ZN5test21kIvEEvz(
     k(ai);
     // CHECK: call {{.*}}@_ZN5test21lITk4TruevEEvz(
diff --git a/clang/test/CodeGenCXX/mangle-subst.cpp b/clang/test/CodeGenCXX/mangle-subst.cpp
index 20f33a72fff83..524e0febe479a 100644
--- a/clang/test/CodeGenCXX/mangle-subst.cpp
+++ b/clang/test/CodeGenCXX/mangle-subst.cpp
@@ -1,4 +1,8 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-apple-darwin9 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-apple-darwin9 -fclang-abi-compat=19 | FileCheck %s --check-prefix=CHECK-CLANG-19
+
+//CHECK: @_ZTCN16MangleCtorVTable4InstE0_NS_1A4ImplINS1_4WrapEEE
+//CHECK-CLANG-19: @_ZTCN16MangleCtorVTable4InstE0_NS_1A4ImplINS0_4WrapEEE
 
 struct X {};
 
@@ -96,3 +100,26 @@ typename X<T>::template Y<T>::type f(typename X<T>::template Y<T>::type2) { retu
 // CHECK: @_ZN12ManglePrefix1fIiEENS_1XIT_E1YIS2_E4typeENS5_5type2E
 template int f<int>(int);
 }
+
+namespace MangleCtorVTable {
+namespace A {
+
+class VBase {
+ public:
+  virtual ~VBase() {};
+};
+
+struct Wrap {};
+
+template <typename T>
+class Impl : public virtual VBase {
+ public:
+};
+
+} // namespace A
+
+struct Inst : public A::Impl<A::Wrap> {};
+
+void Test() { Inst a; }
+
+}
diff --git a/clang/test/CodeGenCXX/pr59765-modules-global-ctor-dtor.cppm b/clang/test/CodeGenCXX/pr59765-modules-global-ctor-dtor.cppm
index 9956348f87ff4..ad5a3e14a81db 100644
--- a/clang/test/CodeGenCXX/pr59765-modules-global-ctor-dtor.cppm
+++ b/clang/test/CodeGenCXX/pr59765-modules-global-ctor-dtor.cppm
@@ -1,9 +1,10 @@
 // https://github.com/llvm/llvm-project/issues/59765
 // FIXME: Since the signature of the constructors/destructors is
 // different in different targets. The current CHECK can't work
-// well when targeting or running on AIX and z/OS.
+// well when targeting AIX and z/OS.
 // It would be better to add the corresponding test for other test.
-// UNSUPPORTED: system-zos, system-aix
+// UNSUPPORTED: system-aix
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 //
 // RUN: rm -rf %t
 // RUN: mkdir %t
diff --git a/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
index 2487e0fcd4343..223e408a37892 100644
--- a/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
+++ b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
@@ -7,22 +7,25 @@
 // CHECK-NEXT: [[X:%.*]] = alloca half, align 2
 // CHECK-NEXT: [[Y:%.*]] = alloca half, align 2
 // CHECK-NEXT: [[Z:%.*]] = alloca half, align 2
-// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[X]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: [[Y_ASCAST:%.*]] = addrspacecast ptr [[Y]] to ptr addrspace(4)
+// CHECK-NEXT: [[Z_ASCAST:%.*]] = addrspacecast ptr [[Z]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr addrspace(4) [[X_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(4) [[Y_ASCAST]], align 2
 // CHECK-NEXT: [[ADD:%.*]] = fadd half [[TMP0]], [[TMP1]]
-// CHECK-NEXT: store half [[ADD]], ptr [[Z]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[X]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: store half [[ADD]], ptr addrspace(4) [[Z_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(4) [[X_ASCAST]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr addrspace(4) [[Y_ASCAST]], align 2
 // CHECK-NEXT: [[SUB:%.*]] = fsub half [[TMP2]], [[TMP3]]
-// CHECK-NEXT: store half [[SUB]], ptr [[Z]], align 2
-// CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[X]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: store half [[SUB]], ptr addrspace(4) [[Z_ASCAST]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load half, ptr addrspace(4) [[X_ASCAST]], align 2
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr addrspace(4) [[Y_ASCAST]], align 2
 // CHECK-NEXT: [[MUL:%.*]] = fmul half [[TMP4]], [[TMP5]]
-// CHECK-NEXT: store half [[MUL]], ptr [[Z]], align 2
-// CHECK-NEXT: [[TMP6:%.*]] = load half, ptr [[X]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: store half [[MUL]], ptr addrspace(4) [[Z_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load half, ptr addrspace(4) [[X_ASCAST]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr addrspace(4) [[Y_ASCAST]], align 2
 // CHECK-NEXT: [[DIV:%.*]] = fdiv half [[TMP6]], [[TMP7]]
-// CHECK-NEXT: store half [[DIV]], ptr [[Z]], align 2
+// CHECK-NEXT: store half [[DIV]], ptr addrspace(4) [[Z_ASCAST]], align 2
 // CHECK-NEXT: ret void
 //
 void f() {
diff --git a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
index b3733decdb550..1f7c160bb86e9 100644
--- a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
+++ b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -std=c++20 %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple spirv64-unknown-unknown -fsycl-is-device -std=c++20 %s -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -std=c++20 %s -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct S { char buf[32]; };
 template <S s> constexpr const char *begin() { return s.buf; }
@@ -37,6 +37,6 @@ const void *s = observable_addr();
 // CHECK: define linkonce_odr noundef ptr @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} noundef ptr addrspace(4) @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // CHECK: %call = call noundef ptr @_Z6calleePK1S(ptr noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr))
-// WITH-NONZERO-DEFAULT-AS: %call = call {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4)))
+// WITH-NONZERO-DEFAULT-AS: %call = call {{.*}} noundef{{.*}} ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4)))
 // CHECK: declare noundef ptr @_Z6calleePK1S(ptr noundef)
 // WITH-NONZERO-DEFAULT-AS: declare {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef)
diff --git a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
index 3acbdd8fd97ee..0fb553de4485a 100644
--- a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
+++ b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 %s -triple spirv64-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X {
   ~X();
diff --git a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
index 412ac6c287258..55c76ed7f344f 100644
--- a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s
-// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 %s -triple=spirv64-amd-amdhsa -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X { };
 
@@ -12,7 +12,7 @@ void f() {
   } catch (const X x) {
     // CHECK: catch ptr addrspace(1) @_ZTI1X
     // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTI1X to ptr))
-    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTI1X to ptr addrspace(4)))
+    // WITH-NONZERO-DEFAULT-AS: call{{.*}} i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTI1X to ptr addrspace(4)))
   }
 }
@@ -23,6 +23,6 @@ void h() {
   } catch (char const(&)[4]) {
     // CHECK: catch ptr addrspace(1) @_ZTIA4_c
     // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr))
-    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr addrspace(4)))
+    // WITH-NONZERO-DEFAULT-AS: call{{.*}} i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr addrspace(4)))
   }
 }
diff --git a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
index f6dc38ec9f292..782e59c887bd3 100644
--- a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -std=c++11 -o - | FileCheck %s
-// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 -I%S %s -triple spirv64-amd-amdhsa -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 #include <typeinfo>
 namespace Test1 {
diff --git a/clang/test/CodeGenCXX/typeid-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
index 98af17f4fc888..20699dc170ace 100644
--- a/clang/test/CodeGenCXX/typeid-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
-// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 -I%S %s -triple spirv64-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 #include <typeinfo>
 namespace Test1 {
@@ -39,7 +39,7 @@ const std::type_info &A10_c_ti = typeid(char const[10]);
 // CHECK-LABEL: define{{.*}} ptr @_ZN5Test11fEv
 // CHECK-SAME: personality ptr @__gxx_personality_v0
 // WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} ptr addrspace(4) @_ZN5Test11fEv
-// WITH-NONZERO-DEFAULT-AS-SAME: personality ptr @__gxx_personality_v0
+// WITH-NONZERO-DEFAULT-AS-SAME: personality ptr addrspace(4) @__gxx_personality_v0
 const char *f() {
   try {
     // CHECK: br i1
diff --git a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
index 350303cc6e9b3..60eb8f17f91fd 100644
--- a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s -check-prefix=AS
-// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - | FileCheck %s -check-prefix=NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 -I%S %s -triple spirv64-amd-amdhsa -emit-llvm -o - | FileCheck %s -check-prefix=NONZERO-DEFAULT-AS
 // RUN: %clang_cc1 -I%S %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s -check-prefix=NO-AS
 
 #include <typeinfo>
@@ -26,30 +26,30 @@ class B : A {
 
 unsigned long Fn(B& b) {
 // AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr), ptr {{.*}} %2)
-// NONZERO-DEFAULT-AS: %call = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr addrspace(4)), ptr addrspace(4) {{.*}} %2)
+// NONZERO-DEFAULT-AS: %call = call{{.*}} noundef zeroext{{.*}} i1 @_ZNKSt9type_infoeqERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr addrspace(4)), ptr addrspace(4) {{.*}} %2)
 // NO-AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} @_ZTISt9type_info, ptr {{.*}} %2)
   if (typeid(std::type_info) == typeid(b))
     return 42;
 // AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr), ptr {{.*}} %5)
-// NONZERO-DEFAULT-AS: %call2 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), ptr addrspace(4) {{.*}} %5)
+// NONZERO-DEFAULT-AS: %call2 = call{{.*}} noundef zeroext{{.*}} i1 @_ZNKSt9type_infoneERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), ptr addrspace(4) {{.*}} %5)
 // NO-AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} @_ZTIi, ptr {{.*}} %5)
   if (typeid(int) != typeid(b))
     return 1712;
 // AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr))
-// NONZERO-DEFAULT-AS: %call5 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr addrspace(4)))
+// NONZERO-DEFAULT-AS: %call5 = call{{.*}} noundef{{.*}} ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr addrspace(4)))
 // NO-AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} @_ZTI1A)
 // AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
-// NONZERO-DEFAULT-AS: %call7 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} %8)
+// NONZERO-DEFAULT-AS: %call7 = call{{.*}} noundef{{.*}} ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} %8)
 // NO-AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
   if (typeid(A).name() == typeid(b).name())
     return 0;
 // AS: %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr))
-// NONZERO-DEFAULT-AS: %call11 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr addrspace(4) {{.*}} %11, ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr addrspace(4)))
+// NONZERO-DEFAULT-AS: %call11 = call{{.*}} noundef zeroext{{.*}} i1 @_ZNKSt9type_info6beforeERKS_(ptr addrspace(4) {{.*}} %11, ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr addrspace(4)))
 // NO-AS: %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} @_ZTIf)
   if (typeid(b).before(typeid(float)))
     return 1;
 // AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
-// NONZERO-DEFAULT-AS: %call15 = call{{.*}} noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr addrspace(4) {{.*}} %14)
+// NONZERO-DEFAULT-AS: %call15 = call{{.*}} noundef{{.*}} i64 @_ZNKSt9type_info9hash_codeEv(ptr addrspace(4) {{.*}} %14)
 // NO-AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
   return typeid(b).hash_code();
 }
diff --git a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
index ecafa99d8be00..e8d3a8407cda7 100644
--- a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
 // RUN: %clang_cc1 %s -triple i686-pc-win32 -emit-llvm -o %t.ms.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
-// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// RUN: %clang_cc1 %s -triple=spirv64-amd-amdhsa -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
 
 // FIXME: Assume load should not require -fstrict-vtable-pointers
 // RUN: FileCheck --check-prefix=CHECK1 --input-file=%t.ll %s
@@ -29,7 +29,7 @@ void g(A *a) { a->foo(); }
 // CHECK1: call{{.*}} void @_ZN5test11AC1Ev(ptr {{((addrspace(4)){0,1})}}
 // CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, i32 0, i32 2)
-// CHECK1: call void @llvm.assume(i1 %[[CMP]])
+// CHECK1: call{{.*}} void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
 void fooA() {
@@ -41,7 +41,7 @@ void fooA() {
 // CHECK1: call{{.*}} void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
 // CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, i32 0, i32 2)
-// CHECK1: call void @llvm.assume(i1 %[[CMP]])
+// CHECK1: call{{.*}} void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
 void fooB() {
@@ -75,12 +75,12 @@ void h(B *b) { b->bar(); }
 // CHECK2: call{{.*}} void @_ZN5test21CC1Ev(ptr
 // CHECK2: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{.*}}
 // CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 0, i32 2)
-// CHECK2: call void @llvm.assume(i1 %[[CMP]])
+// CHECK2: call{{.*}} void @llvm.assume(i1 %[[CMP]])
 
 // CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}, i64 8
 // CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%[[ADD_PTR]]
 // CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 1, i32 2)
-// CHECK2: call void @llvm.assume(i1 %[[CMP2]])
+// CHECK2: call{{.*}} void @llvm.assume(i1 %[[CMP2]])
 // CHECK2: call{{.*}} void @_ZN5test21gEPNS_1AE(
 // CHECK2-LABEL: {{^}}}
 
@@ -111,7 +111,7 @@ void g(B *a) { a->foo(); }
 // CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
 // CHECK3: call{{.*}} void @_ZN5test31CC1Ev(ptr
 // CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, i32 0, i32 3)
-// CHECK3: call void @llvm.assume(i1 %[[CMP]])
+// CHECK3: call{{.*}} void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
 void test() {
   C c;
@@ -140,11 +140,11 @@ void g(C *c) { c->foo(); }
 // CHECK4: call{{.*}} void @_ZN5test41CC1Ev(ptr
 // CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
-// CHECK4: call void @llvm.assume(i1 %[[CMP]]
+// CHECK4: call{{.*}} void @llvm.assume(i1 %[[CMP]]
 
 // CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
-// CHECK4: call void @llvm.assume(i1 %[[CMP2]])
+// CHECK4: call{{.*}} void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: {{^}}}
 
 void test() {
@@ -214,7 +214,7 @@ void A::foo() {}
 
 // CHECK7-LABEL: define{{.*}} void @_ZN5test71gEv()
 // CHECK7: call{{.*}} void @_ZN5test71AC1Ev(
-// CHECK7: call void @llvm.assume(
+// CHECK7: call{{.*}} void @llvm.assume(
 // CHECK7-LABEL: {{^}}}
 void g() {
   A *a = new A();
@@ -257,7 +257,7 @@ struct E : A {
 };
 
 // CHECK8-LABEL: define{{.*}} void @_ZN5test81bEv()
-// CHECK8: call void @llvm.assume(
+// CHECK8: call{{.*}} void @llvm.assume(
 // CHECK8-LABEL: {{^}}}
 void b() {
   B b;
@@ -285,7 +285,7 @@ void d() {
 }
 
 // CHECK8-LABEL: define{{.*}} void @_ZN5test81eEv()
-// CHECK8: call void @llvm.assume(
+// CHECK8: call{{.*}} void @llvm.assume(
 // CHECK8-LABEL: {{^}}}
 void e() {
   E e;
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
index 876d0845cc515..8b5b9dd353064 100644
--- a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 %s -triple=spirv64-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct Field {
   Field();
diff --git a/clang/test/CodeGenCXX/vtt-address-space.cpp b/clang/test/CodeGenCXX/vtt-address-space.cpp
index 4c3d0a534611c..3409bc7cbb69d 100644
--- a/clang/test/CodeGenCXX/vtt-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-address-space.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+// RUN: %clang_cc1 %s -triple=spirv64-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 // This is the sample from the C++ Itanium ABI, p2.6.2.
 namespace Test {
diff --git a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
index c575f49ff6971..05811bb7e1285 100644
--- a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
+++ b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
@@ -12,7 +12,7 @@
 // CHECK: @d ={{.*}} addrspace(1) externally_initialized global
 __device__ int d;
 
-// CHECK: @c ={{.*}} addrspace(1) externally_initialized global
+// CHECK: @c ={{.*}} addrspace(1) externally_initialized constant
 __constant__ int c;
 
 // CHECK: @s ={{.*}} addrspace(3) global
diff --git a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl
index 02e570ebdcb4f..f65cdbb43e27b 100644
--- a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl
+++ b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
+// XFAIL: *
+// Resource indexing will be properly implemented in llvm/llvm-project#95956
+
 const RWBuffer<float> In;
 RWBuffer<float> Out;
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index 174f4c3eaaad2..19699dcf14d9f 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -1,6 +1,12 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
 
+// XFAIL: *
+// This expectedly fails because create.handle is no longer called
+// from RWBuffer constructor and the replacement has not been
+// implemented yet. This test should be updated to expect
+// dx.create.handleFromBinding as part of issue #105076.
+
 RWBuffer<float> Buf;
 
 // CHECK: define linkonce_odr noundef ptr @"??0?$RWBuffer@M@hlsl@@QAA@XZ"
@@ -10,4 +16,4 @@ RWBuffer<float> Buf;
 // CHECK: store ptr %[[HandleRes]], ptr %h, align 4
 
 // CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
-// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
\ No newline at end of file
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
index 34019e5b18693..178332d03e640 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
@@ -1,5 +1,12 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
 
+// XFAIL: *
+// This expectedly fails because create.handle is no longer invoked
+// from StructuredBuffer constructor and the replacement has not been
+// implemented yet. This test should be updated to expect
+// dx.create.handleFromBinding as part of issue #105076.
+
 StructuredBuffer<float> Buf;
 
 // CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl
new file mode 100644
index 0000000000000..e1d80df5015c9
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+
+// CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK-NOT: bitcast
+// CHECK: ret i32 [[VAL]]
+int test_int(int p0) {
+  return asint(p0);
+}
+
+// CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK-NOT: bitcast
+// CHECK: ret i32 [[VAL]]
+int test_uint(uint p0) {
+  return asint(p0);
+}
+
+// CHECK: define {{.*}}test_float{{.*}}(float {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK: bitcast float [[VAL]] to i32
+int test_float(float p0) {
+  return asint(p0);
+}
+
+// CHECK: define {{.*}}test_vector_int{{.*}}(<4 x i32> {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK-NOT: bitcast
+// CHECK: ret <4 x i32> [[VAL]]
+int4 test_vector_int(int4 p0) {
+  return asint(p0);
+}
+
+// CHECK: define {{.*}}test_vector_uint{{.*}}(<4 x i32> {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK-NOT: bitcast
+// CHECK: ret <4 x i32> [[VAL]]
+int4 test_vector_uint(uint4 p0) {
+  return asint(p0);
+}
+
+// CHECK: define {{.*}}test_vector_float{{.*}}(<4 x float> {{.*}} [[VAL:%.*]]){{.*}}
+// CHECK: bitcast <4 x float> [[VAL]] to <4 x i32>
+int4 test_vector_float(float4 p0) {
+  return asint(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/create_handle.hlsl b/clang/test/CodeGenHLSL/builtins/create_handle.hlsl
deleted file mode 100644
index 61226c2b54e72..0000000000000
--- a/clang/test/CodeGenHLSL/builtins/create_handle.hlsl
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-
-void fn() {
-  (void)__builtin_hlsl_create_handle(0);
-}
-
-// CHECK: call ptr @llvm.dx.create.handle(i8 0)
diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
new file mode 100644
index 0000000000000..708779daaa7b6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
@@ -0,0 +1,77 @@
+// DirectX target:
+//
+// ---------- Native Half support test -----------
+//
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   -DFNATTRS=noundef -DTYPE=half
+
+//
+// ---------- No Native Half support test -----------
+//
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s \
+// RUN:   -DFNATTRS=noundef -DTYPE=float
+
+
+// Spirv target:
+//
+// ---------- Native Half support test -----------
+//
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   -DFNATTRS="spir_func noundef" -DTYPE=half
+
+//
+// ---------- No Native Half support test -----------
+//
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s \
+// RUN:   -DFNATTRS="spir_func noundef" -DTYPE=float
+
+
+
+// CHECK: define [[FNATTRS]] [[TYPE]] @
+// CHECK: %fmod = frem [[TYPE]]
+// CHECK: ret [[TYPE]] %fmod
+half test_fmod_half(half p0, half p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <2 x [[TYPE]]> @
+// CHECK: %fmod = frem <2 x [[TYPE]]>
+// CHECK: ret <2 x [[TYPE]]> %fmod
+half2 test_fmod_half2(half2 p0, half2 p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <3 x [[TYPE]]> @
+// CHECK: %fmod = frem <3 x [[TYPE]]>
+// CHECK: ret <3 x [[TYPE]]> %fmod
+half3 test_fmod_half3(half3 p0, half3 p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <4 x [[TYPE]]> @
+// CHECK: %fmod = frem <4 x [[TYPE]]>
+// CHECK: ret <4 x [[TYPE]]> %fmod
+half4 test_fmod_half4(half4 p0, half4 p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] float @
+// CHECK: %fmod = frem float
+// CHECK: ret float %fmod
+float test_fmod_float(float p0, float p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <2 x float> @
+// CHECK: %fmod = frem <2 x float>
+// CHECK: ret <2 x float> %fmod
+float2 test_fmod_float2(float2 p0, float2 p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <3 x float> @
+// CHECK: %fmod = frem <3 x float>
+// CHECK: ret <3 x float> %fmod
+float3 test_fmod_float3(float3 p0, float3 p1) { return fmod(p0, p1); }
+
+// CHECK: define [[FNATTRS]] <4 x float> @
+// CHECK: %fmod = frem <4 x float>
+// CHECK: ret <4 x float> %fmod
+float4 test_fmod_float4(float4 p0, float4 p1) { return fmod(p0, p1); }
+
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
index 9481b0d60a272..6b053dc6add1f 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK: define spir_func void @main() [[A0:#[0-9]+]] {
+// CHECK: define spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] {
 void main() {
 // CHECK: entry:
 // CHECK: %[[CT_ENTRY:[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/entry.hlsl b/clang/test/CodeGenHLSL/entry.hlsl
index ec4254e76fb66..cd3bf948df48c 100644
--- a/clang/test/CodeGenHLSL/entry.hlsl
+++ b/clang/test/CodeGenHLSL/entry.hlsl
@@ -2,6 +2,10 @@
 // RUN:   dxil-pc-shadermodel6.3-compute %s -hlsl-entry foo \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -hlsl-entry foo \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
 // Make sure not mangle entry.
 // CHECK:define void @foo()
 // Make sure add function attribute and numthreads attribute.
diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
index ae3a3b5f90199..f72fe059cb576 100644
--- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
+++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
@@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) {
 }
 
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]]
+// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]]
 // CHECK: ret i1
 
 // Initialize tree with given buffer
 // Imagine the inout works
diff --git a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
index 59c1620334d0e..975a7264fd3f0 100644
--- a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
+++ b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
@@ -6,7 +6,8 @@
 // CHECK: define void @foo()
 // CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
 // CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
-// CHECK: call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+// CHECK-DXIL: call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(i32 %[[#ID]])
 [shader("compute")]
 [numthreads(8,8,1)]
 void foo(uint Idx : SV_DispatchThreadID) {}
@@ -17,6 +18,7 @@ void foo(uint Idx : SV_DispatchThreadID) {}
 // CHECK: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id(i32 1)
 // CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
 // CHECK-DXIL: call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
+// CHECK-SPIRV: call spir_func void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
 [shader("compute")]
 [numthreads(8,8,1)]
 void bar(uint2 Idx : SV_DispatchThreadID) {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index 138616ccca718..7f07160bf5e88 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -50,6 +50,10 @@ void test_s_wait_event_export_ready() {
 
 // CHECK-LABEL: @test_global_add_f32
 // CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+#if !defined(__SPIRV__)
 void test_global_add_f32(float *rtn, global float *addr, float x) {
+#else
+void test_global_add_f32(float *rtn, __attribute__((address_space(1))) float *addr, float x) {
+#endif
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 9274c80abd8c0..bf5f2971cf118 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -665,23 +665,24 @@ void test_s_getpc(global ulong* out)
 }
 
 // CHECK-LABEL: @test_ds_append_lds(
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %ptr, i1 false)
-kernel void test_ds_append_lds(global int* out, local int* ptr) {
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %{{.+}}, i1 false)
 #if !defined(__SPIRV__)
-  *out = __builtin_amdgcn_ds_append(ptr);
+kernel void test_ds_append_lds(global int* out, local int* ptr) {
 #else
-  *out = __builtin_amdgcn_ds_append((__attribute__((address_space(3))) int*)(int*)ptr);
+kernel void test_ds_append_lds(__attribute__((address_space(1))) int* out, __attribute__((address_space(3))) int* ptr) {
 #endif
+  *out = __builtin_amdgcn_ds_append(ptr);
 }
 
 // CHECK-LABEL: @test_ds_consume_lds(
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %ptr, i1 false)
-kernel void test_ds_consume_lds(global int* out, local int* ptr) {
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %{{.+}}, i1 false)
+
 #if !defined(__SPIRV__)
-  *out = __builtin_amdgcn_ds_consume(ptr);
+kernel void test_ds_consume_lds(global int* out, local int* ptr) {
 #else
-  *out = __builtin_amdgcn_ds_consume((__attribute__((address_space(3))) int*)(int*)ptr);
+kernel void test_ds_consume_lds(__attribute__((address_space(1))) int* out, __attribute__((address_space(3))) int* ptr) {
 #endif
+  *out = __builtin_amdgcn_ds_consume(ptr);
 }
 
 // CHECK-LABEL: @test_gws_init(
@@ -835,7 +836,11 @@ kernel void test_s_setreg(uint val) {
 }
 
 // CHECK-LABEL test_atomic_inc_dec(
+#if !defined(__SPIRV__)
 void test_atomic_inc_dec(local uint *lptr, global uint *gptr, uint val) {
+#else
+void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribute__((address_space(1))) uint *gptr, uint val) {
+#endif
   uint res;
 
   // CHECK: atomicrmw uinc_wrap ptr addrspace(3) %lptr, i32 %val syncscope("workgroup") seq_cst, align 4
@@ -851,7 +856,11 @@ void test_atomic_inc_dec(local uint *lptr, global uint *gptr, uint val) {
   res = __builtin_amdgcn_atomic_dec32(gptr, val, __ATOMIC_SEQ_CST, "");
 
   // CHECK: atomicrmw volatile udec_wrap ptr addrspace(1) %gptr, i32 %val seq_cst, align 4
+  #if !defined(__SPIRV__)
   res = __builtin_amdgcn_atomic_dec32((volatile global uint*)gptr, val, __ATOMIC_SEQ_CST, "");
+  #else
+  res = __builtin_amdgcn_atomic_dec32((volatile __attribute__((address_space(1))) uint*)gptr, val, __ATOMIC_SEQ_CST, "");
+  #endif
 }
 
 // CHECK-LABEL test_wavefrontsize(
diff --git a/clang/test/Driver/B-opt.c b/clang/test/Driver/B-opt.c
index df85dee4b7040..48139e71a9001 100644
--- a/clang/test/Driver/B-opt.c
+++ b/clang/test/Driver/B-opt.c
@@ -1,28 +1,28 @@
 // Check -B driver option.
 
 /// Target triple prefix is not detected for -B.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
 // RUN:   -B %S/Inputs/B_opt_tree/dir1 -fuse-ld=ld 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-B-OPT-TRIPLE %s
 // CHECK-B-OPT-TRIPLE-NOT: "{{.*}}/Inputs/B_opt_tree/dir1{{/|\\\\}}i386-unknown-linux-ld"
 //
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
 // RUN:   -B %S/Inputs/B_opt_tree/dir2 -fuse-ld=ld 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-B-OPT-DIR %s
 // CHECK-B-OPT-DIR: "{{.*}}/Inputs/B_opt_tree/dir2{{/|\\\\}}ld"
 //
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
 // RUN:   -B %S/Inputs/B_opt_tree/dir3/prefix- -fuse-ld=ld 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-B-OPT-PREFIX %s
 // CHECK-B-OPT-PREFIX: "{{.*}}/Inputs/B_opt_tree/dir3{{/|\\\\}}prefix-ld"
 //
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
 // RUN:   -B %S/Inputs/B_opt_tree/dir3/prefix- \
 // RUN:   -B %S/Inputs/B_opt_tree/dir2 2>&1 -fuse-ld=ld \
 // RUN:   | FileCheck --check-prefix=CHECK-B-OPT-MULT %s
 // CHECK-B-OPT-MULT: "{{.*}}/Inputs/B_opt_tree/dir3{{/|\\\\}}prefix-ld"
 //
 // RUN: %clang -B %S/Inputs/does_not_exist -print-search-dirs \
-// RUN:   -target aarch64-linux-gnu \
+// RUN:   --target=aarch64-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-B-OPT-INVALID %s
 // CHECK-B-OPT-INVALID-NOT: /..//bin
diff --git a/clang/test/Driver/arm-execute-only.c b/clang/test/Driver/arm-execute-only.c
index a9bf1656fd27e..d654ec364a87f 100644
--- a/clang/test/Driver/arm-execute-only.c
+++ b/clang/test/Driver/arm-execute-only.c
@@ -19,6 +19,9 @@
 // RUN: not %clang -### --target=arm-arm-none-eabi -march=armv8-m.main -mpure-code -mno-movt %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix CHECK-PURE-CODE-NO-MOVT
+// RUN: echo "-DABC" > %t.cfg
+// RUN: not %clang -### --target=arm-arm-none-eabi -march=armv8-m.main -mpure-code -mno-movt --config %t.cfg %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix CHECK-PURE-CODE-NO-MOVT
 // CHECK-PURE-CODE-NO-MOVT: error: option '-mpure-code' cannot be specified with '-mno-movt'
 
 // RUN: not %clang -### --target=arm-arm-none-eabi -march=armv6-m -mexecute-only -fropi %s 2>&1 \
diff --git a/clang/test/Driver/as-options.s b/clang/test/Driver/as-options.s
index 73d002c7ef7ed..1e5f392de4555 100644
--- a/clang/test/Driver/as-options.s
+++ b/clang/test/Driver/as-options.s
@@ -1,36 +1,36 @@
 // PR21000: Test that -I is passed to both external and integrated assemblers.
 
-// RUN: %clang -target x86_64-linux-gnu -c -no-integrated-as %s \
+// RUN: %clang --target=x86_64-linux-gnu -c -no-integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target x86_64-linux-gnu -c -no-integrated-as %s \
+// RUN: %clang --target=x86_64-linux-gnu -c -no-integrated-as %s \
 // RUN:   -I foo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target x86_64-linux-gnu -c -integrated-as %s \
+// RUN: %clang --target=x86_64-linux-gnu -c -integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target x86_64-linux-gnu -c -integrated-as %s \
+// RUN: %clang --target=x86_64-linux-gnu -c -integrated-as %s \
 // RUN:   -I foo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
 // Other GNU targets
-// RUN: %clang -target aarch64-linux-gnu -c -no-integrated-as %s \
+// RUN: %clang --target=aarch64-linux-gnu -c -no-integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target aarch64-linux-gnu -c -integrated-as %s \
+// RUN: %clang --target=aarch64-linux-gnu -c -integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target armv7-linux-gnueabihf -c -no-integrated-as %s \
+// RUN: %clang --target=armv7-linux-gnueabihf -c -no-integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
 
-// RUN: %clang -target armv7-linux-gnueabihf -c -integrated-as %s \
+// RUN: %clang --target=armv7-linux-gnueabihf -c -integrated-as %s \
 // RUN:   -Ifoo_dir -### 2>&1 \
 // RUN:   | FileCheck %s
@@ -53,45 +53,45 @@
 // RUN:   -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=WARN --allow-empty %s
 
-// RUN: %clang -mimplicit-it=always -target armv7-linux-gnueabi -E \
+// RUN: %clang -mimplicit-it=always --target=armv7-linux-gnueabi -E \
 // RUN:   -fintegrated-as -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -mimplicit-it=always -target armv7-linux-gnueabi -E \
+// RUN: %clang -mimplicit-it=always --target=armv7-linux-gnueabi -E \
 // RUN:   -fno-integrated-as -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=WARN --allow-empty %s
 
-// RUN: %clang -mimplicit-it=always -target armv7-linux-gnueabi -E \
+// RUN: %clang -mimplicit-it=always --target=armv7-linux-gnueabi -E \
 // RUN:   -fintegrated-as -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -mimplicit-it=always -target armv7-linux-gnueabi -E \
+// RUN: %clang -mimplicit-it=always --target=armv7-linux-gnueabi -E \
 // RUN:   -fno-integrated-as -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=WARN --allow-empty %s
 
-// RUN: %clang -Wa,-mbig-obj -target i386-pc-windows -E -fintegrated-as \
+// RUN: %clang -Wa,-mbig-obj --target=i386-pc-windows -E -fintegrated-as \
 // RUN:   -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Wa,-mbig-obj -target i386-pc-windows -E -fno-integrated-as \
+// RUN: %clang -Wa,-mbig-obj --target=i386-pc-windows -E -fno-integrated-as \
 // RUN:   -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Wa,-mbig-obj -target i386-pc-windows -E -fintegrated-as \
+// RUN: %clang -Wa,-mbig-obj --target=i386-pc-windows -E -fintegrated-as \
 // RUN:   -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Wa,-mbig-obj -target i386-pc-windows -E -fno-integrated-as \
+// RUN: %clang -Wa,-mbig-obj --target=i386-pc-windows -E -fno-integrated-as \
 // RUN:   -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Xassembler -mbig-obj -target i386-pc-windows -E -fintegrated-as \
+// RUN: %clang -Xassembler -mbig-obj --target=i386-pc-windows -E -fintegrated-as \
 // RUN:   -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Xassembler -mbig-obj -target i386-pc-windows -E \
+// RUN: %clang -Xassembler -mbig-obj --target=i386-pc-windows -E \
 // RUN:   -fno-integrated-as -o /dev/null -x c++ %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Xassembler -mbig-obj -target i386-pc-windows -E -fintegrated-as \
+// RUN: %clang -Xassembler -mbig-obj --target=i386-pc-windows -E -fintegrated-as \
 // RUN:   -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
 
-// RUN: %clang -Xassembler -mbig-obj -target i386-pc-windows -E \
+// RUN: %clang -Xassembler -mbig-obj --target=i386-pc-windows -E \
 // RUN:   -fno-integrated-as -o /dev/null -x assembler-with-cpp %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NOWARN --allow-empty %s
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index a6f338533ad76..07a25da0269fd 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -406,9 +406,9 @@
 // RUN:   /Zm \
 // RUN:   /Zo \
 // RUN:   /Zo- \
-// RUN:   -### -- %s 2>&1 | FileCheck -check-prefix=IGNORED %s
+// RUN:   -### -- %s 2>&1 | FileCheck -DMSG=%errc_ENOENT -check-prefix=IGNORED %s
 // IGNORED-NOT: argument unused during compilation
-// IGNORED-NOT: no such file or directory
+// IGNORED-NOT: [[MSG]]
 
 // Don't confuse /openmp- with the /o flag:
 // IGNORED-NOT: "-o" "penmp-.obj"
diff --git a/clang/test/Driver/cl-zc.cpp b/clang/test/Driver/cl-zc.cpp
index c7cf5b1b6525b..9b1ea53888ceb 100644
--- a/clang/test/Driver/cl-zc.cpp
+++ b/clang/test/Driver/cl-zc.cpp
@@ -133,9 +133,9 @@
 // RUN:   /Zc:inline \
 // RUN:   /Zc:rvalueCast \
 // RUN:   /Zc:ternary \
-// RUN:   -### -- %s 2>&1 | FileCheck -check-prefix=IGNORED %s
+// RUN:   -### -- %s 2>&1 | FileCheck -DMSG=%errc_ENOENT -check-prefix=IGNORED %s
 // IGNORED-NOT: argument unused during compilation
-// IGNORED-NOT: no such file or directory
+// IGNORED-NOT: [[MSG]]
 
 // Negated form warns:
 // RUN: %clang_cl /c \
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index adb6f075b6c15..2e1736ca72aa7 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -463,8 +463,8 @@
 // RUN: %clang -### -S -fno-unsigned-char %s 2>&1 | FileCheck -check-prefix=CHAR-SIGN4 %s
 // CHAR-SIGN4-NOT: -fno-signed-char
 
-// RUN: %clang -target x86_64-unknown-none-none -### -fshort-wchar -fno-short-wchar %s 2>&1 | FileCheck -check-prefix=CHECK-WCHAR1 -check-prefix=DELIMITERS %s
-// RUN: %clang -target x86_64-unknown-none-none -### -fno-short-wchar -fshort-wchar %s 2>&1 | FileCheck -check-prefix=CHECK-WCHAR2 -check-prefix=DELIMITERS %s
+// RUN: %clang --target=x86_64-unknown-none-none -### -fshort-wchar -fno-short-wchar %s 2>&1 | FileCheck -check-prefix=CHECK-WCHAR1 -check-prefix=DELIMITERS %s
+// RUN: %clang --target=x86_64-unknown-none-none -### -fno-short-wchar -fshort-wchar %s 2>&1 | FileCheck -check-prefix=CHECK-WCHAR2 -check-prefix=DELIMITERS %s
 // Make sure we don't match the -NOT lines with the linker invocation.
 // Delimiters match the start of the cc1 and the start of the linker lines
 // DELIMITERS: {{^ (\(in-process\)|")}}
@@ -489,7 +489,7 @@
 // CHECK-ALLOW-PLACEHOLDERS: -fallow-editor-placeholders
 // CHECK-NO-ALLOW-PLACEHOLDERS-NOT: -fallow-editor-placeholders
 
-// RUN: %clang -### -target x86_64-unknown-windows-msvc -fno-short-wchar %s 2>&1 | FileCheck -check-prefix CHECK-WINDOWS-ISO10646 %s
+// RUN: %clang -### --target=x86_64-unknown-windows-msvc -fno-short-wchar %s 2>&1 | FileCheck -check-prefix CHECK-WINDOWS-ISO10646 %s
 // CHECK-WINDOWS-ISO10646: "-fwchar-type=int"
 // CHECK-WINDOWS-ISO10646: "-fsigned-wchar"
@@ -530,16 +530,16 @@
 // CHECK-NO-NULL-POINTER-CHECKS: "-fno-delete-null-pointer-checks"
 // CHECK-NULL-POINTER-CHECKS-NOT: "-fno-delete-null-pointer-checks"
 
-// RUN: %clang -### -S -target x86_64-unknown-linux -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -fno-record-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -fno-record-gcc-switches -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -frecord-gcc-switches -fno-record-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -fno-record-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -fno-record-command-line -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
-// RUN: %clang -### -S -target x86_64-unknown-linux -frecord-command-line -fno-record-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -fno-record-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -fno-record-gcc-switches -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -frecord-gcc-switches -fno-record-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -fno-record-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -fno-record-command-line -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-linux -frecord-command-line -fno-record-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-NO-RECORD-GCC-SWITCHES %s
 // Test with a couple examples of non-ELF object file formats
-// RUN: %clang -### -S -target x86_64-unknown-macosx -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
+// RUN: %clang -### -S --target=x86_64-unknown-macosx -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES %s
 // RUN: not %clang -### -S --target=x86_64-unknown-windows -frecord-command-line %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES-ERROR %s
 // CHECK-RECORD-GCC-SWITCHES: "-record-command-line"
 // CHECK-NO-RECORD-GCC-SWITCHES-NOT: "-record-command-line"
@@ -553,7 +553,7 @@
 // RUN: rm -rf "%t.r/with spaces"
 // RUN: mkdir -p "%t.r/with spaces"
 // RUN: cp %clang "%t.r/with spaces/clang"
-// RUN: "%t.r/with spaces/clang" -### -S -target x86_64-unknown-linux -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES-ESCAPED %s
+// RUN: "%t.r/with spaces/clang" -### -S --target=x86_64-unknown-linux -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES-ESCAPED %s
 // CHECK-RECORD-GCC-SWITCHES-ESCAPED: "-record-command-line" "{{.+}}with\\ spaces{{.+}}"
 // Clean up copy of large binary copied into temp directory to avoid bloat.
 // RUN: rm -f "%t.r/with spaces/clang" || true
@@ -599,15 +599,15 @@
 // CHECK_DISABLE_DIRECT: -fobjc-disable-direct-methods-for-testing
 // CHECK_NO_DISABLE_DIRECT-NOT: -fobjc-disable-direct-methods-for-testing
 
-// RUN: %clang -### -S -fjmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
-// RUN: %clang -### -S -fjmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
-// RUN: %clang -### -S -fjmc -g -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
-// RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
-// RUN: %clang -### -S -fjmc -g -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
-// RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
-// RUN: %clang -### -fjmc -g -flto -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
-// RUN: %clang -### -fjmc -g -flto -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC_LTO %s
-// RUN: %clang -### -fjmc -g -flto -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
+// RUN: %clang -### -S -fjmc --target=x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -g --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
+// RUN: %clang -### -S -fjmc -g -fno-jmc --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -g --target=x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
+// RUN: %clang -### -S -fjmc -g -fno-jmc --target=x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
+// RUN: %clang -### -fjmc -g -flto --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
+// RUN: %clang -### -fjmc -g -flto --target=x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC_LTO %s
+// RUN: %clang -### -fjmc -g -flto -fno-jmc --target=x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
 // CHECK_JMC_WARN: -fjmc requires debug info. Use -g or debug options that enable debugger's stepping function; option ignored
 // CHECK_JMC_WARN_NOT_ELF: -fjmc works only for ELF; option ignored
 // CHECK_NOJMC-NOT: -fjmc
@@ -615,7 +615,7 @@
 // CHECK_NOJMC_LTO-NOT: -plugin-opt=-enable-jmc-instrument
 // CHECK_JMC_LTO: -plugin-opt=-enable-jmc-instrument
 
-// RUN: %clang -### -fintegrated-objemitter -target x86_64 %s 2>&1 | FileCheck -check-prefix=CHECK-INT-OBJEMITTER %s
+// RUN: %clang -### -fintegrated-objemitter --target=x86_64 %s 2>&1 | FileCheck -check-prefix=CHECK-INT-OBJEMITTER %s
 // CHECK-INT-OBJEMITTER-NOT: unsupported option '-fintegrated-objemitter' for target
 // RUN: not %clang -### -fno-integrated-objemitter --target=x86_64 %s 2>&1 | FileCheck -check-prefix=CHECK-NOINT-OBJEMITTER %s
 // CHECK-NOINT-OBJEMITTER: unsupported option '-fno-integrated-objemitter' for target
diff --git a/clang/test/Driver/config-file-errs.c b/clang/test/Driver/config-file-errs.c
index 96b49b2acf8ab..dc4fcdebf44bc 100644
--- a/clang/test/Driver/config-file-errs.c
+++ b/clang/test/Driver/config-file-errs.c
@@ -6,13 +6,13 @@
 
 //--- Argument of '--config' must be existing file, if it is specified by path.
 //
-// RUN: not %clang --config somewhere/nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NONEXISTENT
-// CHECK-NONEXISTENT: configuration file '{{.*}}somewhere{{.}}nonexistent-config-file' cannot be opened: {{[Nn]}}o such file or directory
+// RUN: not %clang --config somewhere/nonexistent-config-file 2>&1 | FileCheck -DMSG=%errc_ENOENT %s -check-prefix CHECK-NONEXISTENT
+// CHECK-NONEXISTENT: configuration file '{{.*}}somewhere{{.}}nonexistent-config-file' cannot be opened: [[MSG]]
 
 //--- All '--config' arguments must be existing files.
 //
-// RUN: not %clang --config %S/Inputs/config-4.cfg --config somewhere/nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NONEXISTENT
+// RUN: not %clang --config %S/Inputs/config-4.cfg --config somewhere/nonexistent-config-file 2>&1 | FileCheck -DMSG=%errc_ENOENT %s -check-prefix CHECK-NONEXISTENT
 
 //--- Argument of '--config' must exist somewhere in well-known directories, if it is specified by bare name.
diff --git a/clang/test/Driver/dxc_E.hlsl b/clang/test/Driver/dxc_E.hlsl
index 05cfca685c9a6..07715a2e2259a 100644
--- a/clang/test/Driver/dxc_E.hlsl
+++ b/clang/test/Driver/dxc_E.hlsl
@@ -1,4 +1,5 @@
 // RUN: not %clang_dxc -Efoo -Tlib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s
+// RUN: not %clang_dxc -Efoo -Tlib_6_7 -spirv foo.hlsl -### %s 2>&1 | FileCheck %s
 
 // Make sure E option flag which translated into "-hlsl-entry".
// CHECK:"-hlsl-entry" "foo" diff --git a/clang/test/Driver/fbasic-block-sections.c b/clang/test/Driver/fbasic-block-sections.c index e13cc81e910be..6dfba5f404cee 100644 --- a/clang/test/Driver/fbasic-block-sections.c +++ b/clang/test/Driver/fbasic-block-sections.c @@ -22,7 +22,8 @@ // CHECK-OPT-NONE: "-fbasic-block-sections=none" // CHECK-OPT-ALL: "-fbasic-block-sections=all" // CHECK-OPT-LIST: "-fbasic-block-sections={{[^ ]*}}fbasic-block-sections.c" -// CHECK-OPT-LABELS: "-fbasic-block-sections=labels" +// CHECK-OPT-LABELS: warning: argument '-fbasic-block-sections=labels' is deprecated, use '-fbasic-block-address-map' instead +// CHECK-OPT-LABELS: "-fbasic-block-address-map" // CHECK-TRIPLE: error: unsupported option '-fbasic-block-sections=all' for target // CHECK-INVALID-VALUE: error: invalid value {{[^ ]*}} in '-fbasic-block-sections={{.*}}' // CHECK-OPT-NULL-LIST: "-fbasic-block-sections=list=" diff --git a/clang/test/Driver/hipstdpar.c b/clang/test/Driver/hipstdpar.c index 2f48bf6b5cf1e..32e040ef70d75 100644 --- a/clang/test/Driver/hipstdpar.c +++ b/clang/test/Driver/hipstdpar.c @@ -1,6 +1,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target // REQUIRES: system-linux +// UNSUPPORTED: target={{.*}}-zos{{.*}} // XFAIL: target={{.*}}hexagon{{.*}} // XFAIL: target={{.*}}-scei{{.*}} // XFAIL: target={{.*}}-sie{{.*}} diff --git a/clang/test/Driver/lld-repro.c b/clang/test/Driver/lld-repro.c index 61904c0e6df30..0e6340865b738 100644 --- a/clang/test/Driver/lld-repro.c +++ b/clang/test/Driver/lld-repro.c @@ -1,5 +1,5 @@ // REQUIRES: lld -// UNSUPPORTED: target={{.*-(ps4|ps5)}} +// UNSUPPORTED: target={{.*-(ps4|ps5)}}, target={{.*}}-zos{{.*}} // RUN: echo "-nostartfiles -nostdlib -fuse-ld=lld -gen-reproducer=error -fcrash-diagnostics-dir=%t" \ // RUN: | sed -e 's/\\/\\\\/g' > %t.rsp diff --git a/clang/test/Driver/notypecheck.s b/clang/test/Driver/notypecheck.s index f6e78d6791182..8e924b57fbdc0 100644 --- a/clang/test/Driver/notypecheck.s +++ b/clang/test/Driver/notypecheck.s @@ -5,7 +5,7 @@ # Verify that without -Wa,--no-type-check the assembler will error out # RUN: not %clang %s -c -o tmp.o -target wasm32-unknown-unknown 2>&1 | FileCheck --check-prefix=ERROR %s -# ERROR: error: popped i64, expected i32 +# ERROR: error: type mismatch, expected [i32] but got [i64] foo: .functype foo () -> (i32) diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c index c0cf0b864028c..4ae65963e361a 100644 --- a/clang/test/Driver/ps5-linker.c +++ b/clang/test/Driver/ps5-linker.c @@ -46,3 +46,27 @@ // CHECK-SYSROOT: {{ld(\.exe)?}}" // CHECK-SYSROOT-SAME: "--sysroot=mysdk" + +// Test that "." is always added to library search paths. This is long-standing +// behavior, unique to PlayStation toolchains. + +// RUN: %clang --target=x64_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LDOT %s + +// CHECK-LDOT: {{ld(\.exe)?}}" +// CHECK-LDOT-SAME: "-L." + +// Test that /target/lib is added to library search paths, if it +// exists and no --sysroot is specified. 
+
+// RUN: rm -rf %t.dir && mkdir %t.dir
+// RUN: env SCE_PROSPERO_SDK_DIR=%t.dir %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-TARGETLIB %s
+// RUN: env SCE_PROSPERO_SDK_DIR=%t.dir %clang --target=x86_64-sie-ps5 %s -### --sysroot=%t.dir 2>&1 | FileCheck --check-prefixes=CHECK-NO-TARGETLIB %s
+
+// CHECK-NO-TARGETLIB: {{ld(\.exe)?}}"
+// CHECK-NO-TARGETLIB-NOT: "-L{{.*[/\\]}}target/lib"
+
+// RUN: mkdir -p %t.dir/target/lib
+// RUN: env SCE_PROSPERO_SDK_DIR=%t.dir %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-TARGETLIB %s
+
+// CHECK-TARGETLIB: {{ld(\.exe)?}}"
+// CHECK-TARGETLIB-SAME: "-L{{.*[/\\]}}target/lib"
diff --git a/clang/test/Driver/relax.s b/clang/test/Driver/relax.s
index b4a696a328eb5..0768a38834447 100644
--- a/clang/test/Driver/relax.s
+++ b/clang/test/Driver/relax.s
@@ -1,5 +1,5 @@
 // REQUIRES: x86-registered-target
-// RUN: %clang -### -c -target x86_64-pc-linux -integrated-as -Wa,--mrelax-relocations=no %s 2>&1 | FileCheck %s
+// RUN: %clang -### -c --target=x86_64-pc-linux -integrated-as -Wa,--mrelax-relocations=no %s 2>&1 | FileCheck %s
 
 // CHECK: "-cc1as"
 // CHECK: "-mrelax-relocations=no"
diff --git a/clang/test/Driver/response-file-errs.c b/clang/test/Driver/response-file-errs.c
index efde7575a51e0..5331c8e308f48 100644
--- a/clang/test/Driver/response-file-errs.c
+++ b/clang/test/Driver/response-file-errs.c
@@ -11,5 +11,5 @@
 
 // If file in `@file` is a directory, it is an error.
 //
-// RUN: not %clang @%S/Inputs -### 2>&1 | FileCheck --check-prefix=DIRECTORY %s
-// DIRECTORY: cannot not open file '{{.*}}Inputs': {{[Ii]}}s a directory
+// RUN: not %clang @%S/Inputs -### 2>&1 | FileCheck -DMSG=%errc_EISDIR --check-prefix=DIRECTORY %s
+// DIRECTORY: cannot not open file '{{.*}}Inputs': [[MSG]]
diff --git a/clang/test/Driver/target-as.s b/clang/test/Driver/target-as.s
index 4881a330b56aa..2b639e95043c3 100644
--- a/clang/test/Driver/target-as.s
+++ b/clang/test/Driver/target-as.s
@@ -1,5 +1,5 @@
 // Make sure the -march is passed down to cc1as.
-// RUN: %clang -target i386-unknown-freebsd -### -c -integrated-as %s \
+// RUN: %clang --target=i386-unknown-freebsd -### -c -integrated-as %s \
 // RUN: -march=geode 2>&1 | FileCheck -check-prefix=TARGET %s
 //
 // TARGET: "-cc1as"
diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b912..f15130d5f8b61 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
 
 // expected-remark@+10 {{Function Name: foo}}
-// expected-remark@+9 {{    SGPRs: 13}}
+// expected-remark@+9 {{    TotalSGPRs: 13}}
 // expected-remark@+8 {{    VGPRs: 10}}
 // expected-remark@+7 {{    AGPRs: 12}}
 // expected-remark@+6 {{    ScratchSize [bytes/lane]: 0}}
diff --git a/clang/test/Frontend/fixed_point_comparisons.c b/clang/test/Frontend/fixed_point_comparisons.c
index 59c4405e41c03..39e62bce51e2b 100644
--- a/clang/test/Frontend/fixed_point_comparisons.c
+++ b/clang/test/Frontend/fixed_point_comparisons.c
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNPADDED
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,PADDED
 
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNPADDED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,PADDED
+
 // Fixed point against other fixed point
 _Bool b_eq_true = 2.5hk == 2.5uhk;  // CHECK-DAG: @b_eq_true = {{.*}}global i8 1, align 1
 _Bool b_eq_false = 2.5hk == 2.4uhk; // CHECK-DAG: @b_eq_false = {{.*}}global i8 0, align 1
diff --git a/clang/test/Frontend/fixed_point_conversions_const.c b/clang/test/Frontend/fixed_point_conversions_const.c
index e6e89ded534fe..889486e5eb806 100644
--- a/clang/test/Frontend/fixed_point_conversions_const.c
+++ b/clang/test/Frontend/fixed_point_conversions_const.c
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+
 // Between different fixed point types
 short _Accum sa_const = 2.5hk; // CHECK-DAG: @sa_const = {{.*}}global i16 320, align 2
diff --git a/clang/test/Frontend/fixed_point_crash.c b/clang/test/Frontend/fixed_point_crash.c
index 3b3911117400c..9cfe6ba64c519 100644
--- a/clang/test/Frontend/fixed_point_crash.c
+++ b/clang/test/Frontend/fixed_point_crash.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -verify -ffixed-point %s
+// RUN: %clang_cc1 -verify -ffixed-point %s -fexperimental-new-constant-interpreter
 
 union a {
   _Accum x;
diff --git a/clang/test/Frontend/fixed_point_div_const.c b/clang/test/Frontend/fixed_point_div_const.c
index 46935207d186a..66c028e608db6 100644
--- a/clang/test/Frontend/fixed_point_div_const.c
+++ b/clang/test/Frontend/fixed_point_div_const.c
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
 // RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+
 // Division between different fixed point types
 short _Accum sa_const = 1.0hk / 2.0hk; // CHECK-DAG: @sa_const = {{.*}}global i16 64, align 2
diff --git a/clang/test/Frontend/fixed_point_errors.c b/clang/test/Frontend/fixed_point_errors.c
index 6a711936f2397..3472f595089a1 100644
--- a/clang/test/Frontend/fixed_point_errors.c
+++ b/clang/test/Frontend/fixed_point_errors.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -verify -ffixed-point %s
+// RUN: %clang_cc1 -verify -ffixed-point %s -fexperimental-new-constant-interpreter
 
 /* We do not yet support long long. No recommended bit widths are given for this
  * size. */
diff --git a/clang/test/Modules/lambda-definitions.cppm b/clang/test/Modules/lambda-definitions.cppm
new file mode 100644
index 0000000000000..fb4bb8d298f0f
--- /dev/null
+++ b/clang/test/Modules/lambda-definitions.cppm
@@ -0,0 +1,45 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/invocable.cppm -emit-module-interface -o %t/invocable.pcm
+// RUN: %clang_cc1 -std=c++20 %t/lambda.cppm -emit-module-interface -o %t/lambda.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 %t/invocable.cppm -emit-reduced-module-interface -o %t/invocable.pcm
+// RUN: %clang_cc1 -std=c++20 %t/lambda.cppm -emit-reduced-module-interface -o %t/lambda.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify
+
+//--- invocable.cppm
+export module invocable;
+export template <class _Fn, class... _Args>
+concept invocable = requires(_Fn&& __fn, _Args&&... __args) {
+  _Fn(__args...);
+};
+
+export template <class _Fn, class _Args>
+constexpr bool is_callable(_Fn&& __fn, _Args&& __args) {
+  return invocable<_Fn, _Args>;
+}
+
+export template <class _Fn>
+struct Callable : _Fn {
+  constexpr explicit Callable(_Fn &&__fn) : _Fn(static_cast<_Fn&&>(__fn)) {}
+
+  template <class _Args>
+  constexpr auto operator()(_Args&& __args) {
+    return _Fn(__args);
+  }
+};
+
+//--- lambda.cppm
+export module lambda;
+import invocable;
+export constexpr auto l = Callable([](auto &&x){});
+
+//--- test.cc
+// expected-no-diagnostics
+import invocable;
+import lambda;
+
+static_assert(is_callable(l, 4) == true);
diff --git a/clang/test/Modules/string-literal-uniqueness.cpp b/clang/test/Modules/string-literal-uniqueness.cpp
new file mode 100644
index 0000000000000..34adc2b0303bd
--- /dev/null
+++ b/clang/test/Modules/string-literal-uniqueness.cpp
@@ -0,0 +1,60 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cpp \
+// RUN:   -o %t/A.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/b.cpp \
+// RUN:   -fmodule-file=A=%t/A.pcm -o %t/B.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/c.cpp \
+// RUN:   -fmodule-file=A=%t/A.pcm -o %t/C.pcm
+
+// RUN: %clang_cc1 -std=c++20 -verify %t/main.cpp \
+// RUN:   -fmodule-file=A=%t/A.pcm \
+// RUN:   -fmodule-file=B=%t/B.pcm \
+// RUN:   -fmodule-file=C=%t/C.pcm
+
+// expected-no-diagnostics
+
+//--- a.cpp
+
+export module A;
+export consteval const char *hello() { return "hello"; }
+export constexpr const char *helloA0 = hello();
+export constexpr const char *helloA1 = helloA0;
+export constexpr const char *helloA2 = hello();
+
+//--- b.cpp
+
+export module B;
+import A;
+export constexpr const char *helloB1 = helloA0;
+export constexpr const char *helloB2 = hello();
+
+//--- c.cpp
+
+export module C;
+import A;
+export constexpr const char *helloC1 = helloA1;
+export constexpr const char *helloC2 = hello();
+
+//--- main.cpp
+
+import A;
+import B;
+import C;
+
+// These are valid: they refer to the same evaluation of the same constant.
+static_assert(helloA0 == helloA1);
+static_assert(helloA0 == helloB1);
+static_assert(helloA0 == helloC1);
+
+// These refer to distinct evaluations, and so may or may not be equal.
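+// (Why "may": the language permits an implementation to pool identical string
+// literals, so two evaluations of hello() can legally yield the same pointer
+// or two distinct ones. For example, a conforming compiler could emit a
+// single read-only copy of "hello" shared by both calls, or one copy per
+// call; neither choice is wrong, which is why the comparisons below are
+// rejected as non-constant.)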
+static_assert(helloA1 == helloA2); // expected-error {{}} expected-note {{unspecified value}}
+static_assert(helloA1 == helloB2); // expected-error {{}} expected-note {{unspecified value}}
+static_assert(helloA1 == helloC2); // expected-error {{}} expected-note {{unspecified value}}
+static_assert(helloA2 == helloB2); // expected-error {{}} expected-note {{unspecified value}}
+static_assert(helloA2 == helloC2); // expected-error {{}} expected-note {{unspecified value}}
+static_assert(helloB2 == helloC2); // expected-error {{}} expected-note {{unspecified value}}
diff --git a/clang/test/OpenMP/lit.local.cfg b/clang/test/OpenMP/lit.local.cfg
index 58ee923cb7ec5..93adc6734d1a2 100644
--- a/clang/test/OpenMP/lit.local.cfg
+++ b/clang/test/OpenMP/lit.local.cfg
@@ -1,5 +1,9 @@
 # -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+import re
 from lit.llvm.subst import ToolSubst
 
 fc = ToolSubst("FileCheck", unresolved="fatal")
 config.substitutions.insert(0, (fc.regex, "FileCheck --allow-unused-prefixes"))
+
+if re.match(r".*-zos", config.target_triple):
+    config.unsupported = True
diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
index aeb7a8369f403..aad8836db1062 100644
--- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
@@ -4,7 +4,7 @@
 // SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
 
 // valid: "spirv-unknown-vulkan-library"
-// valid: define spir_func void @main() #0 {
+// valid: define spir_func void @{{.*main.*}}() #0 {
 
 [numthreads(1,1,1)]
 void main()
diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
index 1c37d72de8614..b2d492d95945c 100644
--- a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
@@ -22,3 +22,7 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]]
 
 // expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}}
 __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
index 15685bd1a3baa..3b2c12e7a96c5 100644
--- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
@@ -14,3 +14,7 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3
 
 // expected-warning@+1{{attribute 'is_rov' is already applied}}
 __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
index 83273426017ed..77530cbf9e4d9 100644
--- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
@@ -11,3 +11,7 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]]
 
 // expected-warning@+1{{attribute 'raw_buffer' is already applied}}
 __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
index 01ff1c007e2b5..63e39daff949b 100644
--- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
@@ -17,3 +17,6 @@ __hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4
 
 // expected-error@+1{{'resource_class' attribute takes one argument}}
 __hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5;
+
+// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] e6;
diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
index 301d61c0e906e..e7d19c3da7216 100644
--- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
@@ -3,10 +3,10 @@
 // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation
 // CHECK: -TemplateArgument type 'float'
 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit referenced h 'float *
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
-// CHECK-SAME: ':'float *'
+// CHECK-SAME: ':'__hlsl_resource_t'
 // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
 RWBuffer<float> Buffer1;
 
@@ -14,10 +14,10 @@ RWBuffer<float> Buffer1;
 // CHECK: -TemplateArgument type 'vector<float, 4>'
 // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4
 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit referenced h 'vector<float, 4>
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]
 // CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]]
-// CHECK-SAME: ':'vector<float, 4>'
+// CHECK-SAME: ':'__hlsl_resource_t'
 // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
 RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4];
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 15c4554a31922..6c9ce4ad5e196 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -105,17 +105,14 @@ void func() {
 #pragma acc loop collapse(force:)
   for(;;){}
 
-  // expected-error@+2{{invalid tag 'unknown' on 'collapse' clause}}
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
-#pragma acc loop collapse(unknown:5)
+  // expected-error@+1{{invalid tag 'unknown' on 'collapse' clause}}
+#pragma acc loop collapse(unknown:1)
   for(;;){}
 
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
-#pragma acc loop collapse(force:5)
+#pragma acc loop collapse(force:1)
   for(;;){}
 
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
-#pragma acc loop collapse(5)
+#pragma acc loop collapse(1)
   for(;;){}
 
   // expected-error@+2{{expected ')'}}
diff --git a/clang/test/ParserOpenACC/parse-clauses.cpp b/clang/test/ParserOpenACC/parse-clauses.cpp
index b7e252e892bea..9613530db77dd 100644
--- a/clang/test/ParserOpenACC/parse-clauses.cpp
+++ b/clang/test/ParserOpenACC/parse-clauses.cpp
@@ -2,13 +2,23 @@
 
 template<unsigned I, typename T>
 void templ() {
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
 #pragma acc loop collapse(I)
-  for(;;){}
+  for(;;)
+    for(;;)
+      for(;;)
+        for(;;)
+          for(;;)
+            for(;;)
+              for(;;);
 
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
 #pragma acc loop collapse(T::value)
-  for(;;){}
+  for(;;)
+    for(;;)
+      for(;;)
+        for(;;)
+          for(;;)
+            for(;;)
+              for(;;);
 
 #pragma acc parallel vector_length(T::value)
   for(;;){}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 5de97649af5d3..6f64bbf22dc66 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sve2 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s
+// RUN:   -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sve2 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sme-lutv2 -fsyntax-only -verify %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -350,3 +350,17 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
   svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
 }
+
+void test_write_zt() __arm_streaming __arm_inout("zt0") {
+  // Check Zt tile 0
+  svwrite_lane_zt(1, svundef_s8(), 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svwrite_zt(1, svundef_s8()); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  // Check index
+  svwrite_lane_zt(0, svundef_s8(), 0); // expected-error {{argument value 0 is outside the valid range [1, 3]}}
+  svwrite_lane_zt(0, svundef_s8(), 4); // expected-error {{argument value 4 is outside the valid range [1, 3]}}
+}
+
+void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
+  // Check Zt tile 0
+  svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+}
diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c
index bfc08118a8ffc..6f0185816b0ac 100644
--- a/clang/test/Sema/attr-target-clones-aarch64.c
+++ b/clang/test/Sema/attr-target-clones-aarch64.c
@@ -24,17 +24,17 @@ int __attribute__((target_clones("rng", "fp16fml+fp", "default"))) redecl4(void)
 // expected-error@+3 {{'target_clones' attribute does not match previous declaration}}
 // expected-note@-2 {{previous declaration is here}}
 // expected-warning@+1 {{version list contains entries that don't impact code generation}}
-int __attribute__((target_clones("dgh+memtag+rpres", "ebf16+dpb+sha1", "default"))) redecl4(void) { return 1; }
+int __attribute__((target_clones("dgh+memtag+rpres", "ebf16+dpb", "default"))) redecl4(void) { return 1; }
 
 int __attribute__((target_version("flagm2"))) redef2(void) { return 1; }
 // expected-error@+2 {{multiversioned function redeclarations require identical target attributes}}
 // expected-note@-2 {{previous declaration is here}}
 int __attribute__((target_clones("flagm2", "default"))) redef2(void) { return 1; }
 
-int __attribute__((target_clones("f32mm", "f64mm", "sha1+fp"))) redef3(void) { return 1; }
+int __attribute__((target_clones("f32mm", "f64mm", "sha2+fp"))) redef3(void) { return 1; }
 // expected-error@+2 {{'target_clones' attribute does not match previous declaration}}
 // expected-note@-2 {{previous declaration is here}}
-int __attribute__((target_clones("f32mm", "sha1+fp", "f64mm"))) redef3(void) { return 1; }
+int __attribute__((target_clones("f32mm", "sha2+fp", "f64mm"))) redef3(void) { return 1; }
 
 int __attribute__((target_clones("rdm+lse+rdm", "lse+rdm"))) dup1(void) { return 1; }
 // expected-warning@+1 {{version list contains duplicate entries}}
diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c
index 88a927a58f991..ed4f6c8556c99 100644
--- a/clang/test/Sema/attr-target-version.c
+++ b/clang/test/Sema/attr-target-version.c
@@ -36,7 +36,7 @@ void __attribute__((target_version("bti+flagm2"))) one(void) {}
 //expected-error@+1 {{multiversioned function redeclarations require identical target attributes}}
 void __attribute__((target_version("flagm2+bti"))) one(void) {}
 
-void __attribute__((target_version("ssbs+sha1"))) two(void) {}
+void __attribute__((target_version("ssbs+sha2"))) two(void) {}
 void __attribute__((target_version("ssbs+fp16fml"))) two(void) {}
 
 //expected-error@+1 {{'main' cannot be a multiversioned function}}
@@ -89,7 +89,7 @@ int bar() {
   return def();
 }
 // expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}}
-int __attribute__((target_version("sha1"))) def(void) { return 1; }
+int __attribute__((target_version("sha2"))) def(void) { return 1; }
 
 int __attribute__((target_version("sve"))) prot();
 // expected-error@-1 {{multiversioned function must have a prototype}}
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
index 1727be1d6286d..26b153dd5b210 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -538,6 +538,32 @@ void test_builtin_elementwise_popcount(int i, float f, double d, float4 v, int3
   // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'unsigned3' (vector of 3 'unsigned int' values)}}
 }
 
+void test_builtin_elementwise_fmod(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) {
+  i = __builtin_elementwise_fmod(p, d);
+  // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}}
+
+  struct Foo foo = __builtin_elementwise_fmod(i, i);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int')}}
+
+  i = __builtin_elementwise_fmod(i);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
+
+  i = __builtin_elementwise_fmod();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
+
+  i = __builtin_elementwise_fmod(i, i, i);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+
+  i = __builtin_elementwise_fmod(v, iv);
+  // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'int3' (vector of 3 'int' values))}}
+
+  i = __builtin_elementwise_fmod(uv, iv);
+  // expected-error@-1 {{arguments are of different types ('unsigned3' (vector of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}}
+
+  i = __builtin_elementwise_fmod(d, v);
+  // expected-error@-1 {{arguments are of different types ('double' vs 'float4' (vector of 4 'float' values))}}
+}
+
 void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) {
   i = __builtin_elementwise_pow(p, d);
   // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}}
@@ -562,7 +588,6 @@ void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, u
 
 }
 
-
 void test_builtin_elementwise_roundeven(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
 
   struct Foo s = __builtin_elementwise_roundeven(f);
diff --git a/clang/test/Sema/diagnose_if.c b/clang/test/Sema/diagnose_if.c
index e9b8497d5ca4e..4df39916c031e 100644
--- a/clang/test/Sema/diagnose_if.c
+++ b/clang/test/Sema/diagnose_if.c
@@ -2,10 +2,10 @@
 
 #define _diagnose_if(...) __attribute__((diagnose_if(__VA_ARGS__)))
 
-void failure1(void) _diagnose_if(); // expected-error{{at least 3 arguments}}
-void failure2(void) _diagnose_if(0); // expected-error{{at least 3 arguments}}
-void failure3(void) _diagnose_if(0, ""); // expected-error{{at least 3 arguments}}
-void failure4(void) _diagnose_if(0, "", "error", 1); // expected-error{{expected string literal as argument}}
+void failure1(void) _diagnose_if(); // expected-error{{exactly 3 arguments}}
+void failure2(void) _diagnose_if(0); // expected-error{{exactly 3 arguments}}
+void failure3(void) _diagnose_if(0, ""); // expected-error{{exactly 3 arguments}}
+void failure4(void) _diagnose_if(0, "", "error", 1); // expected-error{{exactly 3 arguments}}
 void failure5(void) _diagnose_if(0, 0, "error"); // expected-error{{expected string literal as argument of 'diagnose_if' attribute}}
 void failure6(void) _diagnose_if(0, "", "invalid"); // expected-error{{invalid diagnostic type for 'diagnose_if'; use "error" or "warning" instead}}
 void failure7(void) _diagnose_if(0, "", "ERROR"); // expected-error{{invalid diagnostic type}}
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index 69e5395a78a57..731639ab16a73 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -158,17 +158,30 @@ auto begin(C &c) -> decltype(c.begin());
 template <typename T, int N>
 T *begin(T (&array)[N]);
 
+using size_t = decltype(sizeof(0));
+
+template <typename T>
+struct initializer_list {
+  const T* ptr; size_t sz;
+};
 template <typename T>
 struct vector {
   typedef __gnu_cxx::basic_iterator<T> iterator;
   iterator begin();
   iterator end();
   const T *data() const;
+  vector();
+  vector(initializer_list<T> __l);
+
+  template <typename InputIterator>
+  vector(InputIterator first, InputIterator __last);
+
+  T &at(int n);
 };
 
 template <typename T>
 struct basic_string_view {
+  basic_string_view();
   basic_string_view(const T *);
   const T *begin() const;
 };
@@ -203,11 +216,21 @@ template <typename T>
 struct optional {
   optional();
   optional(const T&);
+
+  template <typename U = T>
+  optional(U&& t);
+
+  template <typename U>
+  optional(optional<U>&& __t);
+
   T &operator*() &;
   T &&operator*() &&;
   T &value() &;
   T &&value() &&;
 };
+template <typename T>
+optional<__decay(T)> make_optional(T&&);
+
 
 template <typename T>
 struct stack {
@@ -587,3 +610,171 @@ std::string_view test2() {
   return k.value(); // expected-warning {{address of stack memory associated}}
 }
 } // namespace GH108272
+
+namespace GH100526 {
+void test() {
+  std::vector<std::string_view> v1({std::string()}); // expected-warning {{object backing the pointer will be destroyed at the end}}
+  std::vector<std::string_view> v2({
+      std::string(), // expected-warning {{object backing the pointer will be destroyed at the end}}
+      std::string_view()
+  });
+  std::vector<std::string_view> v3({
+      std::string_view(),
+      std::string() // expected-warning {{object backing the pointer will be destroyed at the end}}
+  });
+
+  std::optional<std::string_view> o1 = std::string(); // expected-warning {{object backing the pointer}}
+
+  std::string s;
+  // This is a tricky use-after-free case, what it does:
+  //   1. make_optional creates a temporary "optional<string>" object
+  //   2. the temporary object owns the underlying string which is copied from s.
+  //   3. the o2 object holds the view to the underlying string of the temporary object.
+  std::optional<std::string_view> o2 = std::make_optional(s); // expected-warning {{object backing the pointer}}
+  std::optional<std::string_view> o3 = std::optional<std::string>(s); // expected-warning {{object backing the pointer}}
+  std::optional<std::string_view> o4 = std::optional<std::string_view>(s);
+
+  // FIXME: should work for assignment cases
+  v1 = {std::string()};
+  o1 = std::string();
+
+  // no warning on copying pointers.
+  std::vector<std::string_view> n1 = {std::string_view()};
+  std::optional<std::string_view> n2 = {std::string_view()};
+  std::optional<std::string_view> n3 = std::string_view();
+  std::optional<std::string_view> n4 = std::make_optional(std::string_view());
+  const char* b = "";
+  std::optional<std::string_view> n5 = std::make_optional(b);
+  std::optional<std::string_view> n6 = std::make_optional("test");
+}
+
+std::vector<std::string_view> test2(int i) {
+  std::vector<std::string_view> t;
+  if (i)
+    return t; // this is fine, no dangling
+  return std::vector<std::string_view>(t.begin(), t.end());
+}
+
+class Foo {
+ public:
+  operator std::string_view() const { return ""; }
+};
+class [[gsl::Owner]] FooOwner {
+ public:
+  operator std::string_view() const { return ""; }
+};
+std::optional<Foo> GetFoo();
+std::optional<FooOwner> GetFooOwner();
+
+template <typename T>
+struct [[gsl::Owner]] Container1 {
+  Container1();
+};
+template <typename T>
+struct [[gsl::Owner]] Container2 {
+  template <typename U>
+  Container2(const Container1<U>& C2);
+};
+
+std::optional<std::string_view> test3(int i) {
+  std::string s;
+  std::string_view sv;
+  if (i)
+    return s; // expected-warning {{address of stack memory associated}}
+  return sv; // fine
+  Container2<std::string_view> c1 = Container1<Foo>(); // no diagnostic as Foo is not an Owner.
+  Container2<std::string_view> c2 = Container1<FooOwner>(); // expected-warning {{object backing the pointer will be destroyed}}
+  return GetFoo(); // fine, we don't know Foo is owner or not, be conservative.
+  return GetFooOwner(); // expected-warning {{returning address of local temporary object}}
+}
+
+std::optional<int*> test4(int a) {
+  return std::make_optional(nullptr); // fine
+}
+
+
+template <typename T>
+struct [[gsl::Owner]] StatusOr {
+  const T &valueLB() const [[clang::lifetimebound]];
+  const T &valueNoLB() const;
+};
+
+template <typename T>
+struct [[gsl::Pointer]] Span {
+  Span(const std::vector<T> &V);
+
+  const int& getFieldLB() const [[clang::lifetimebound]];
+  const int& getFieldNoLB() const;
+};
+
+
+/////// From Owner<Pointer> ///////
+
+// Pointer from Owner<Pointer>
+std::string_view test5() {
+  std::string_view a = StatusOr<std::string_view>().valueLB(); // expected-warning {{object backing the pointer will be dest}}
+  return StatusOr<std::string_view>().valueLB(); // expected-warning {{returning address of local temporary}}
+
+  // No dangling diagnostics on non-lifetimebound methods.
+  std::string_view b = StatusOr<std::string_view>().valueNoLB();
+  return StatusOr<std::string_view>().valueNoLB();
+}
+
+// Pointer from Owner<Owner>
+// Prevent regression GH108463
+Span<int> test6(std::vector<int> v) {
+  Span<int> dangling = std::vector<int>(); // expected-warning {{object backing the pointer}}
+  dangling = std::vector<int>(); // expected-warning {{object backing the pointer}}
+  return v; // expected-warning {{address of stack memory}}
+}
+
+/////// From Owner<Owner<Pointer>> ///////
+
+// Pointer from Owner<Owner<Pointer>>
+int* test7(StatusOr<StatusOr<int*>> aa) {
+  // No dangling diagnostic on pointer.
+  return aa.valueLB().valueLB(); // OK.
+}
+
+// Owner<Pointer> from Owner<Owner<Pointer>>
+std::vector<int*> test8(StatusOr<std::vector<int*>> aa) {
+  return aa.valueLB(); // OK, no pointer being constructed in this case.
+  return aa.valueNoLB();
+}
+
+// Pointer from Owner<Owner<Pointer>>
+Span<int*> test9(StatusOr<std::vector<int*>> aa) {
+  return aa.valueLB(); // expected-warning {{address of stack memory associated}}
+  return aa.valueNoLB(); // OK.
+}
+
+/////// From Owner<Pointer> ///////
+
+// Pointer<Pointer> from Owner<Pointer>
+Span<int*> test10(StatusOr<Span<int*>> aa) {
+  return aa.valueLB(); // expected-warning {{address of stack memory}}
+  return aa.valueNoLB(); // OK.
+}
+
+/////// From Owner<Owner<Pointer>> ///////
+
+// Pointer<Pointer> from Owner<Owner<Pointer>>
+Span<int*> test11(StatusOr<std::vector<int*>> aa) {
+  return aa.valueLB(); // expected-warning {{address of stack memory}}
+  return aa.valueNoLB(); // OK.
+}
+
+// Lifetimebound and gsl::Pointer.
+const int& test12(Span<int> a) {
+  return a.getFieldLB(); // expected-warning {{reference to stack memory associated}}
+  return a.getFieldNoLB(); // OK.
+}
+
+void test13() {
+  // FIXME: RHS is Owner<Pointer>, we skip this case to avoid false positives.
+  std::optional<Span<int*>> abc = std::vector<int*>{};
+
+  std::optional<Span<int>> t = std::vector<int> {}; // expected-warning {{object backing the pointer will be destroyed}}
+}
+
+} // namespace GH100526
diff --git a/clang/test/SemaCXX/builtins-elementwise-math.cpp b/clang/test/SemaCXX/builtins-elementwise-math.cpp
index c3d8bc593c0bb..5910796c5d298 100644
--- a/clang/test/SemaCXX/builtins-elementwise-math.cpp
+++ b/clang/test/SemaCXX/builtins-elementwise-math.cpp
@@ -255,6 +255,14 @@ void test_builtin_elementwise_fma() {
   static_assert(!is_const<decltype(__builtin_elementwise_fma(a, a, a))>::value);
 }
 
+void test_builtin_elementwise_fmod() {
+  const double a = 2;
+  double b = 1;
+  static_assert(!is_const<decltype(__builtin_elementwise_fmod(a, b))>::value);
+  static_assert(!is_const<decltype(__builtin_elementwise_fmod(b, a))>::value);
+  static_assert(!is_const<decltype(__builtin_elementwise_fmod(a, a))>::value);
+}
+
 void test_builtin_elementwise_pow() {
   const double a = 2;
   double b = 1;
diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp
index f47ed3a1f7ebf..f99bb87b9cbd4 100644
--- a/clang/test/SemaCXX/builtins.cpp
+++ b/clang/test/SemaCXX/builtins.cpp
@@ -1,13 +1,21 @@
-// RUN: %clang_cc1 %s -fsyntax-only -verify -std=c++11 -fcxx-exceptions
-// RUN: %clang_cc1 %s -fsyntax-only -verify -std=c++1z -fcxx-exceptions
+// RUN: %clang_cc1 %s -fsyntax-only -verify -std=c++11 -fcxx-exceptions -fptrauth-intrinsics
+// RUN: %clang_cc1 %s -fsyntax-only -verify -std=c++1z -fcxx-exceptions -fptrauth-intrinsics
 
 typedef const struct __CFString * CFStringRef;
 #define CFSTR __builtin___CFStringMakeConstantString
+#define NSSTR __builtin___NSStringMakeConstantString
 
 void f() {
 #if !defined(__MVS__) && !defined(_AIX)
   // Builtin function __builtin___CFStringMakeConstantString is currently
   // unsupported on z/OS and AIX.
   (void)CFStringRef(CFSTR("Hello"));
+
+  constexpr bool a = CFSTR("Hello") == CFSTR("Hello");
+  // expected-error@-1 {{constant expression}}
+  // expected-note@-2 {{comparison against opaque constant address '&__builtin___CFStringMakeConstantString("Hello")'}}
+  constexpr bool b = NSSTR("Hello") == NSSTR("Hello");
+  // expected-error@-1 {{constant expression}}
+  // expected-note@-2 {{comparison against opaque constant address '&__builtin___NSStringMakeConstantString("Hello")'}}
 #endif
 }
 
@@ -47,7 +55,7 @@ void a(void) {}
 int n;
 void *p = __builtin_function_start(n); // expected-error {{argument must be a function}}
 static_assert(__builtin_function_start(a) == a, ""); // expected-error {{static assertion expression is not an integral constant expression}}
-// expected-note@-1 {{comparison of addresses of literals has unspecified value}}
+// expected-note@-1 {{comparison against opaque constant address '&__builtin_function_start(a)'}}
 } // namespace function_start
 
 void no_ms_builtins() {
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index 44ef540f41fa8..e2ea984b37cd0 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -2,6 +2,10 @@
 // RUN: %clang_cc1 -std=c++20 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx20_23,pre-cxx23 -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
 // RUN: %clang_cc1 -std=c++11 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx11,pre-cxx23 -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
 
+// This macro forces its argument to be constant-folded, even if it's not
+// otherwise a constant expression.
+#define fold(x) (__builtin_constant_p(x) ? (x) : (x))
+
 namespace StaticAssertFoldTest {
 
 int x;
@@ -358,11 +362,36 @@ struct Str {
 
 extern char externalvar[];
 constexpr bool constaddress = (void *)externalvar == (void *)0x4000UL; // expected-error {{must be initialized by a constant expression}} expected-note {{reinterpret_cast}}
-constexpr bool litaddress = "foo" == "foo"; // expected-error {{must be initialized by a constant expression}}
-// expected-note@-1 {{comparison of addresses of literals has unspecified value}}
-// cxx20_23-warning@-2 {{comparison between two arrays is deprecated}}
 static_assert(0 != "foo", "");
 
+// OK: These string literals cannot possibly overlap.
+static_assert(+"foo" != +"bar", "");
+static_assert("xfoo" + 1 != "yfoo" + 1, "");
+static_assert(+"foot" != +"foo", "");
+static_assert(+"foo\0bar" != +"foo\0baz", "");
+
+// These can't overlap because the null terminator for UTF-16 is two bytes wide.
+static_assert(fold((const char*)u"A" != (const char*)"\0A\0x"), "");
+static_assert(fold((const char*)u"A" != (const char*)"A\0\0x"), "");
+
+constexpr const char *string = "hello";
+constexpr const char *also_string = string;
+static_assert(string == string, "");
+static_assert(string == also_string, "");
+
+// These strings may overlap, and so the result of the comparison is unknown.
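+// (For example, a linker that performs string tail merging may legally place
+// "foo" in the last four bytes of the storage for "bar\0foo", so the pointers
+// in may_overlap_3 below could compare equal under one toolchain and unequal
+// under another; rejecting the comparison as non-constant is the only
+// portable behavior.)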
+constexpr bool may_overlap_1 = +"foo" == +"foo"; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_2 = +"foo" == +"foo\0bar"; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_3 = +"foo" == "bar\0foo" + 4; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_4 = "xfoo" + 1 == "xfoo" + 1; // expected-error {{}} expected-note {{addresses of literals}}
+
+// These may overlap even though they have different encodings.
+// One of these two comparisons is non-constant, but due to endianness we don't
+// know which one.
+constexpr bool may_overlap_different_encoding[] =
+  {fold((const char*)u"A" != (const char*)"xA\0\0\0x" + 1), fold((const char*)u"A" != (const char*)"x\0A\0\0x" + 1)};
+  // expected-error@-2 {{}} expected-note@-1 {{addresses of literals}}
+
 }
 
 namespace MaterializeTemporary {
@@ -1543,16 +1572,10 @@ namespace MutableMembers {
 
 namespace Fold {
 
-  // This macro forces its argument to be constant-folded, even if it's not
-  // otherwise a constant expression.
-  #define fold(x) (__builtin_constant_p(x) ? (x) : (x))
-
   constexpr int n = (long)(char*)123; // expected-error {{constant expression}} expected-note {{reinterpret_cast}}
   constexpr int m = fold((long)(char*)123); // ok
   static_assert(m == 123, "");
 
-  #undef fold
-
 }
 
 namespace DR1454 {
diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index 70ab5dcd357c1..936d3600953b9 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1306,3 +1306,18 @@ constexpr int field(int a) {
 static_assert(field(3), ""); // expected-error {{constant expression}} \
                              // expected-note {{in call to 'field(3)'}}
 }
+
+namespace literal_comparison {
+
+constexpr bool different_in_loop(bool b = false) {
+  if (b) return false;
+
+  const char *p[2] = {};
+  for (const char *&r : p)
+    r = "hello";
+  return p[0] == p[1]; // expected-note {{addresses of literals}}
+}
+constexpr bool check = different_in_loop();
+  // expected-error@-1 {{}} expected-note@-1 {{in call}}
+
+}
diff --git a/clang/test/SemaCXX/diagnose_if-warning-group.cpp b/clang/test/SemaCXX/diagnose_if-warning-group.cpp
deleted file mode 100644
index a39c0c0c33c9e..0000000000000
--- a/clang/test/SemaCXX/diagnose_if-warning-group.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// RUN: %clang_cc1 %s -verify=expected,wall -fno-builtin -Wno-pedantic -Werror=comment -Wno-error=abi -Wfatal-errors=assume -Wno-fatal-errors=assume -Wno-format
-// RUN: %clang_cc1 %s -verify=expected,wno-all,pedantic,format -fno-builtin -Wno-all -Werror=comment -Wno-error=abi -Werror=assume -Wformat
-
-#define diagnose_if(...) __attribute__((diagnose_if(__VA_ARGS__)))
-
-#ifndef EMTY_WARNING_GROUP
-void bougus_warning() diagnose_if(true, "oh no", "warning", "bogus warning") {} // expected-error {{unknown warning group 'bogus warning'}}
-
-void show_in_system_header() diagnose_if(true, "oh no", "warning", "assume", "Banane") {} // expected-error {{'diagnose_if' attribute takes no more than 4 arguments}}
-#endif // EMTY_WARNING_GROUP
-
-template <bool b>
-void diagnose_if_wcomma() diagnose_if(b, "oh no", "warning", "comma") {}
-
-template <bool b>
-void diagnose_if_wcomment() diagnose_if(b, "oh no", "warning", "comment") {}
-
-void empty_warning_group() diagnose_if(true, "oh no", "warning", "") {} // expected-error {{unknown warning group ''}}
-void empty_warning_group_error() diagnose_if(true, "oh no", "error", "") {} // expected-error {{unknown warning group ''}}
-
-void diagnose_if_wabi_default_error() diagnose_if(true, "ABI stuff", "error", "abi") {}
-void diagnose_assume() diagnose_if(true, "Assume diagnostic", "warning", "assume") {}
-
-void Wall() diagnose_if(true, "oh no", "warning", "all") {}
-void Wpedantic() diagnose_if(true, "oh no", "warning", "pedantic") {}
-void Wformat_extra_args() diagnose_if(true, "oh no", "warning", "format-extra-args") {}
-
-void call() {
-  diagnose_if_wcomma<true>(); // expected-warning {{oh no}}
-  diagnose_if_wcomma<false>();
-  diagnose_if_wcomment<true>(); // expected-error {{oh no}}
-  diagnose_if_wcomment<false>();
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcomma"
-  diagnose_if_wcomma<true>();
-  diagnose_if_wcomment<true>(); // expected-error {{oh no}}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcomment"
-  diagnose_if_wcomma<true>(); // expected-warning {{oh no}}
-  diagnose_if_wcomment<true>();
-#pragma clang diagnostic pop
-
-  diagnose_if_wcomma<true>(); // expected-warning {{oh no}}
-  diagnose_if_wcomment<true>(); // expected-error {{oh no}}
-
-  diagnose_if_wabi_default_error(); // expected-warning {{ABI stuff}}
-  diagnose_assume(); // expected-error {{Assume diagnostic}}
-
-  // Make sure that the -Wassume diagnostic isn't fatal
-  diagnose_if_wabi_default_error(); // expected-warning {{ABI stuff}}
-
-  Wall(); // wall-warning {{oh no}}
-  Wpedantic(); // pedantic-warning {{oh no}}
-  Wformat_extra_args(); // format-warning {{oh no}}
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wformat"
-  Wformat_extra_args();
-#pragma clang diagnostic pop
-}
diff --git a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
index 14e242f009dc5..2257a4c2d975a 100644
--- a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
+++ b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
@@ -179,3 +179,57 @@ void foo() {
 }
 
 } // namespace GH99877
+
+namespace GH101754 {
+
+template <typename... Ts> struct Overloaded : Ts... {
+  using Ts::operator()...;
+};
+
+template <typename... Ts> Overloaded(Ts...) -> Overloaded<Ts...>;
+
+template <class T, class U>
+concept same_as = __is_same(T, U); // #same_as
+
+template <class... Ts> constexpr auto foo() {
+  return Overloaded{[](same_as<Ts> auto value) { return value; }...}; // #lambda
+}
+
+static_assert(foo<int, double>()(123) == 123);
+static_assert(foo<int, double>()(2.718) == 2.718);
+
+static_assert(foo<int, double>()('c'));
+// expected-error@-1 {{no matching function}}
+
+// expected-note@#lambda {{constraints not satisfied}}
+// expected-note@#lambda {{'same_as<char, int>' evaluated to false}}
+// expected-note@#same_as {{evaluated to false}}
+
+// expected-note@#lambda {{constraints not satisfied}}
+// expected-note@#lambda {{'same_as<char, double>' evaluated to false}}
+// expected-note@#same_as {{evaluated to false}}
+
+template <class T, class U>
+concept C = same_as<T, U> && same_as<U, float>; // #C
+
+template <class... Ts> constexpr auto bar() {
+  return ([]<class T>() {
+    return Overloaded{[](C<T> auto value) { // #bar
+      return value;
+    }...};
+  }.template operator()<Ts>(), ...);
+}
+static_assert(bar<int, float>()(3.14f)); // OK, bar() returns the last overload i.e. <float>.
+
+static_assert(bar<int, float>()(123));
+// expected-error@-1 {{no matching function}}
+// expected-note@#bar {{constraints not satisfied}}
+// expected-note@#bar {{'C<int, int>' evaluated to false}}
+// expected-note@#C {{evaluated to false}}
+
+// expected-note@#bar {{constraints not satisfied}}
+// expected-note@#bar {{'C<int, float>' evaluated to false}}
+// expected-note@#C {{evaluated to false}}
+// expected-note@#same_as 2{{evaluated to false}}
+
+} // namespace GH101754
diff --git a/clang/test/SemaCXX/lambda-attributes.cpp b/clang/test/SemaCXX/lambda-attributes.cpp
new file mode 100644
index 0000000000000..97d23053b0f46
--- /dev/null
+++ b/clang/test/SemaCXX/lambda-attributes.cpp
@@ -0,0 +1,62 @@
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-linux -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-linux -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-linux -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -std=c++23 -triple x86_64-unknown-linux -include-pch %t -ast-dump-all /dev/null | FileCheck %s
+// expected-no-diagnostics
+
+// Check that we both don't crash on transforming FunctionProtoType's
+// wrapped in type sugar and that we don't drop it when performing
+// instantiations either.
+
+#define PRESERVE __attribute__((preserve_most))
+
+// Skip to the instantiation of f<int>().
+// CHECK: FunctionDecl {{.*}} f 'void ()' implicit_instantiation
+template <typename T>
+void f() {
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const __attribute__((preserve_most))':'void (int) __attribute__((preserve_most)) const' implicit_instantiation
+  (void) [] (T) __attribute__((preserve_most)) { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const {{\[}}[clang::annotate_type(...)]]':'void (int) const' implicit_instantiation
+  (void) [] (T) [[clang::annotate_type("foo")]] { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const {{\[}}[clang::annotate_type(...)]] {{\[}}[clang::annotate_type(...)]] {{\[}}[clang::annotate_type(...)]]':'void (int) const' implicit_instantiation
+  (void) [] (T) [[clang::annotate_type("foo")]]
+                [[clang::annotate_type("foo")]]
+                [[clang::annotate_type("foo")]] { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const __attribute__((preserve_most)) {{\[}}[clang::annotate_type(...)]]':'void (int) __attribute__((preserve_most)) const' implicit_instantiation
+  (void) [] (T) __attribute__((preserve_most))
+                [[clang::annotate_type("foo")]] { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const __attribute__((cdecl)) {{\[}}[clang::annotate_type(...)]]':'void (int) const' implicit_instantiation
+  (void) [] (T) __attribute__((cdecl))
+                [[clang::annotate_type("foo")]] { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const {{\[}}[clang::annotate_type(...)]]':'void (int) const' implicit_instantiation
+  (void) [] (T t) [[clang::annotate_type("foo", t)]] { };
+
+  // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const __attribute__((preserve_most)) {{\[}}[clang::annotate_type(...)]]':'void (int) __attribute__((preserve_most)) const' implicit_instantiation
+  (void) [] (T t) __attribute__((preserve_most))
+                  [[clang::annotate_type("foo", t, t, t, t)]] { };
+
+  // Check that the MacroQualifiedType is preserved.
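+  // (MacroQualifiedType is the type-sugar node that records that the
+  // attribute was spelled via the PRESERVE macro; if instantiation dropped
+  // it, the dumps below would lose the leading 'PRESERVE' in the printed
+  // type.)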
+ // CHECK: CXXMethodDecl {{.*}} operator() 'PRESERVE void (int) __attribute__((preserve_most)) const':'void (int) __attribute__((preserve_most)) const' implicit_instantiation + (void) [] (T) PRESERVE { }; + + // CHECK: CXXMethodDecl {{.*}} operator() 'PRESERVE void (int) __attribute__((preserve_most)) const {{\[}}[clang::annotate_type(...)]]':'void (int) __attribute__((preserve_most)) const' implicit_instantiation + (void) [] (T) PRESERVE [[clang::annotate_type("foo")]] { }; + + // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const {{\[}}[clang::annotate_type(...)]]':'void (int) const' implicit_instantiation + (void) [] (T) [[clang::annotate_type("foo")]] { + // CHECK: CXXMethodDecl {{.*}} operator() 'PRESERVE void (int) __attribute__((preserve_most)) const {{\[}}[clang::annotate_type(...)]]':'void (int) __attribute__((preserve_most)) const' implicit_instantiation + auto l = [] (U u = {}) PRESERVE [[clang::annotate_type("foo", u)]] { }; + + // CHECK: DeclRefExpr {{.*}} 'PRESERVE void (int) __attribute__((preserve_most)) const {{\[}}[clang::annotate_type(...)]]':'void (int) __attribute__((preserve_most)) const' lvalue CXXMethod + l(); + }; +} + +void g() { + f(); +} diff --git a/clang/test/SemaCXX/lambda-conversion-op-cc.cpp b/clang/test/SemaCXX/lambda-conversion-op-cc.cpp index 3632f8c8c80aa..6d9f5d702e132 100644 --- a/clang/test/SemaCXX/lambda-conversion-op-cc.cpp +++ b/clang/test/SemaCXX/lambda-conversion-op-cc.cpp @@ -44,19 +44,19 @@ void useage() { // CHECK: VarDecl {{.*}} vectorcall ' // CHECK: LambdaExpr - // CHECK: CXXMethodDecl {{.*}} operator() 'void (int, float, double) __attribute__((vectorcall)) const' + // CHECK: CXXMethodDecl {{.*}} operator() 'void (int, float, double) const __attribute__((vectorcall))':'void (int, float, double) __attribute__((vectorcall)) const' // CHECK: CXXConversionDecl {{.*}} operator void (*)(int, float, double) __attribute__((vectorcall)) 'void // CHECK: CXXMethodDecl {{.*}} __invoke 'void (int, float, double) __attribute__((vectorcall))' static inline // WIN32: VarDecl {{.*}} thiscall ' // WIN32: LambdaExpr - // WIN32: CXXMethodDecl {{.*}} operator() 'void (int, float, double) __attribute__((thiscall)) const' + // WIN32: CXXMethodDecl {{.*}} operator() 'void (int, float, double) const __attribute__((thiscall))':'void (int, float, double) __attribute__((thiscall)) const' // WIN32: CXXConversionDecl {{.*}} operator void (*)(int, float, double) 'void // WIN32: CXXMethodDecl {{.*}} __invoke 'void (int, float, double)' static inline // CHECK: VarDecl {{.*}} cdecl ' // CHECK: LambdaExpr - // CHECK: CXXMethodDecl {{.*}} operator() 'void (int, float, double) const' + // CHECK: CXXMethodDecl {{.*}} operator() 'void (int, float, double) const __attribute__((cdecl))':'void (int, float, double) const' // NODEF: CXXConversionDecl {{.*}} operator void (*)(int, float, double) 'void // NODEF: CXXMethodDecl {{.*}} __invoke 'void (int, float, double)' static inline // VECTDEF: CXXConversionDecl {{.*}} operator void (*)(int, float, double) __attribute__((vectorcall)) 'void @@ -108,8 +108,8 @@ void useage() { // CHECK: LambdaExpr // CHECK: FunctionTemplateDecl {{.*}} operator() // CHECK: CXXMethodDecl {{.*}} operator() 'auto (auto) __attribute__((vectorcall)) const' inline - // CHECK: CXXMethodDecl {{.*}} operator() 'void (char) __attribute__((vectorcall)) const' implicit_instantiation inline - // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) __attribute__((vectorcall)) const' implicit_instantiation inline + // CHECK: CXXMethodDecl {{.*}} operator() 'void 
(char) const __attribute__((vectorcall))':'void (char) __attribute__((vectorcall)) const' implicit_instantiation inline + // CHECK: CXXMethodDecl {{.*}} operator() 'void (int) const __attribute__((vectorcall))':'void (int) __attribute__((vectorcall)) const' implicit_instantiation inline // CHECK: FunctionTemplateDecl {{.*}} operator auto (*)(type-parameter-0-0) __attribute__((vectorcall)) // LIN64: CXXConversionDecl {{.*}} operator auto (*)(type-parameter-0-0) __attribute__((vectorcall)) 'auto (*() const noexcept)(auto) __attribute__((vectorcall))' // LIN64: CXXConversionDecl {{.*}} operator auto (*)(char) __attribute__((vectorcall)) 'void (*() const noexcept)(char) __attribute__((vectorcall))' diff --git a/clang/test/SemaCXX/ptrauth-sign-constant.cpp b/clang/test/SemaCXX/ptrauth-sign-constant.cpp new file mode 100644 index 0000000000000..396962e33e2fa --- /dev/null +++ b/clang/test/SemaCXX/ptrauth-sign-constant.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s + +int n; +constexpr bool compare_result = __builtin_ptrauth_sign_constant(&n, 2, 0) == &n; +// expected-error@-1 {{constant expression}} +// expected-note@-2 {{comparison against opaque constant address '&__builtin_ptrauth_sign_constant(&n, 2, 0)'}} \ No newline at end of file diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl new file mode 100644 index 0000000000000..815a0c35cb04c --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify + + +int4 test_asint_too_many_arg(float p0, float p1) { + return asint(p0, p1); + // expected-error@-1 {{no matching function for call to 'asint'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'V', but 2 arguments were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'F', but 2 arguments were provided}} +} + +int test_asint_double(double p1) { + return asint(p1); + // expected-error@hlsl/hlsl_intrinsics.h:* {{no matching function for call to 'bit_cast'}} + // expected-note@-2 {{in instantiation of function template specialization 'hlsl::asint'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'double'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = int, T = double]: no type named 'Type'}} +} + +int test_asint_half(half p1) { + return asint(p1); + // expected-error@hlsl/hlsl_intrinsics.h:* {{no matching function for call to 'bit_cast'}} + // expected-note@-2 {{in instantiation of function template specialization 'hlsl::asint'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'half'}} + // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = int, T = half]: no type named 'Type'}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl new file mode 100644 index 0000000000000..e4fa609dd6a05 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl @@ -0,0 +1,22 @@ + +// RUN: %clang_cc1 -finclude-default-header 
diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
new file mode 100644
index 0000000000000..e4fa609dd6a05
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
@@ -0,0 +1,22 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float builtin_bool_to_float_type_promotion(bool p1, bool p2) {
+  return __builtin_elementwise_fmod(p1, p2);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+float2 builtin_fmod_int2_to_float2_promotion(int2 p1, int2 p2) {
+  return __builtin_elementwise_fmod(p1, p2);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int2' (aka 'vector<int, 2>'))}}
+}
+
+half builtin_fmod_double_type (double p0, double p1) {
+  return __builtin_elementwise_fmod(p0, p1);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+half builtin_fmod_double2_type (double2 p0, double2 p1) {
+  return __builtin_elementwise_fmod(p0, p1);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl b/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl
new file mode 100644
index 0000000000000..d9f20a4cb79ec
--- /dev/null
+++ b/clang/test/SemaHLSL/Language/UsualArithmeticConversions.hlsl
@@ -0,0 +1,383 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -fnative-half-type %s -DERRORS -Wconversion -Wdouble-promotion -verify
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl2018 -finclude-default-header -fnative-half-type %s -DERRORS -Wconversion -Wdouble-promotion -verify
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -fnative-half-type %s -ast-dump | FileCheck %s
+
+#if __HLSL_VERSION <= 2021
+// expected-warning@*{{support for HLSL language version hlsl2018 is incomplete, recommend using hlsl202x instead}}
+#endif
+
+//----------------------------------------------------------------------------//
+// Case 1: float4 * int4 and inverse.
+//
+// In both cases here the int is converted to a float and the computation
+// produces a float value.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4f4i4 'float4 (float4, int4)'
+// CHECK: BinaryOperator {{.*}} 'float4':'vector<float, 4>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export float4 f4f4i4(float4 A, int4 B) {
+  return A * B; // expected-warning{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float4' (aka 'vector<float, 4>') may lose precision}}
+}
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4i4f4 'float4 (float4, int4)'
+// CHECK: BinaryOperator {{.*}} 'float4':'vector<float, 4>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+export float4 f4i4f4(float4 A, int4 B) {
+  return B * A; // expected-warning{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float4' (aka 'vector<float, 4>') may lose precision}}
+}
+
+//----------------------------------------------------------------------------//
+// Case 2: float4 * int2 and inverse.
+//
+// In both cases the float vector is truncated to a float2 and the integer
+// vector is converted to a float2.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f2f4i2 'float2 (float4, int2)'
+// CHECK: BinaryOperator {{.*}} 'vector<float, 2>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}}'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector<int, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'B' 'int2':'vector<int, 2>'
+export float2 f2f4i2(float4 A, int2 B) {
+  // expected-warning@#f2f4i2 {{implicit conversion from 'int2' (aka 'vector<int, 2>') to 'vector<float, 2>' (vector of 2 'float' values) may lose precision}}
+  // expected-warning@#f2f4i2 {{implicit conversion truncates vector: 'float4' (aka 'vector<float, 4>') to 'vector<float, 2>' (vector of 2 'float' values)}}
+  return A * B; // #f2f4i2
+}
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f2i2f4 'float2 (float4, int2)'
+// CHECK: BinaryOperator {{.*}} 'vector<float, 2>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector<int, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'B' 'int2':'vector<int, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}}'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+export float2 f2i2f4(float4 A, int2 B) {
+  // expected-warning@#f2i2f4 {{implicit conversion from 'int2' (aka 'vector<int, 2>') to 'vector<float, 2>' (vector of 2 'float' values) may lose precision}}
+  // expected-warning@#f2i2f4 {{implicit conversion truncates vector: 'float4' (aka 'vector<float, 4>') to 'vector<float, 2>' (vector of 2 'float' values)}}
+  return B * A; // #f2i2f4
+}
+
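A scalar C++ analogue of Cases 1 and 2 shows the same usual-arithmetic-conversions behavior; the helper name below is illustrative only:

  #include <type_traits>

  // Usual arithmetic conversions: the int operand converts to float
  // before the multiply, so the whole expression is float.
  static_assert(std::is_same_v<decltype(1.0f * 1), float>);

  float scale(float a, int b) {
    return a * b; // -Wconversion flags this: e.g. 16777217 rounds as a float
  }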
+//----------------------------------------------------------------------------//
+// Case 3: Integers of mismatched sign, equivalent size, but the unsigned type
+// has lower conversion rank.
+//
+// This is the odd-ball case for HLSL that isn't really in spec, but we should
+// handle gracefully. The lower-ranked unsigned type is converted to the
+// equivalent unsigned type of higher rank, and the signed type is also
+// converted to that unsigned type (meaning `unsigned long` becomes `unsigned
+// long long`, and `long long` becomes `unsigned long long`).
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used wierdo 'int4 (vector<unsigned long, 4>, vector<long long, 4>)'
+// CHECK: BinaryOperator {{.*}} 'vector<unsigned long long, 4>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<unsigned long long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<unsigned long, 4>'
+// CHECK-NEXT: DeclRefExpr{{.*}} 'vector<unsigned long, 4>' lvalue ParmVar {{.*}} 'A' 'vector<unsigned long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<unsigned long long, 4>'
+// CHECK-NEXT: ImplicitCastExpr{{.*}}> 'vector<long long, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}}'vector<long long, 4>' lvalue ParmVar {{.*}} 'B' 'vector<long long, 4>'
+export int4 wierdo(vector<unsigned long, 4> A, vector<long long, 4> B) {
+  // expected-warning@#wierdo {{implicit conversion loses integer precision: 'vector<unsigned long long, 4>' (vector of 4 'unsigned long long' values) to 'vector<int, 4>' (vector of 4 'int' values)}}
+  // expected-warning@#wierdo {{implicit conversion changes signedness: 'vector<long long, 4>' (vector of 4 'long long' values) to 'vector<unsigned long long, 4>' (vector of 4 'unsigned long long' values)}}
+  return A * B; // #wierdo
+}
+
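The same odd-ball rule exists for scalar C and C++. A sketch, assuming an LP64 target where long and long long are both 64 bits (on LLP64 targets the result is long long instead):

  #include <type_traits>

  // long long cannot represent every unsigned long value when both are
  // 64 bits, so both operands convert to unsigned long long.
  static_assert(std::is_same_v<decltype(1ul * 1ll), unsigned long long>,
                "mixed-sign operands of equal size go unsigned");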
+//----------------------------------------------------------------------------//
+// Case 4: Compound assignment of float4 with an int4.
+//
+// In compound assignment the RHS is converted to match the LHS.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4f4i4compound 'float4 (float4, int4)'
+// CHECK: CompoundAssignOperator {{.*}} 'float4':'vector<float, 4>' lvalue '+=' ComputeLHSTy='float4':'vector<float, 4>' ComputeResultTy='float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export float4 f4f4i4compound(float4 A, int4 B) {
+  A += B; // expected-warning{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float4' (aka 'vector<float, 4>') may lose precision}}
+  return A;
+}
+
+
+//----------------------------------------------------------------------------//
+// Case 5: Compound assignment of float2 with an int4.
+//
+// In compound assignment the RHS is converted to match the LHS.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4f2i4compound 'float4 (float2, int4)'
+// CHECK: CompoundAssignOperator {{.*}} 'float2':'vector<float, 2>' lvalue '+=' ComputeLHSTy='float2':'vector<float, 2>' ComputeResultTy='float2':'vector<float, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector<float, 2>' lvalue ParmVar {{.*}} 'A' 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export float4 f4f2i4compound(float2 A, int4 B) {
+  // expected-warning@#f4f2i4compound{{implicit conversion truncates vector: 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>')}}
+  // expected-warning@#f4f2i4compound{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>') may lose precision}}
+  A += B; // #f4f2i4compound
+  return A.xyxy;
+}
+
+//----------------------------------------------------------------------------//
+// Case 6: float2 * int4
+//
+// The int4 vector is truncated to int2 then converted to float2.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4f2i4 'float2 (float2, int4)'
+// CHECK: BinaryOperator {{.*}} 'float2':'vector<float, 2>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector<float, 2>' lvalue ParmVar {{.*}} 'A' 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export float2 f4f2i4(float2 A, int4 B) {
+  // expected-warning@#f4f2i4{{implicit conversion truncates vector: 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>')}}
+  // expected-warning@#f4f2i4{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>') may lose precision}}
+  return A * B; // #f4f2i4
+}
+
+//----------------------------------------------------------------------------//
+// Case 7: Compound assignment of half4 with float4, and inverse.
+//
+// In compound assignment the RHS is converted to match the LHS.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4h4f4compound 'float4 (half4, float4)'
+// CHECK: CompoundAssignOperator {{.*}} 'half4':'vector<half, 4>' lvalue '+=' ComputeLHSTy='half4':'vector<half, 4>' ComputeResultTy='half4':'vector<half, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'half4':'vector<half, 4>' lvalue ParmVar {{.*}} 'A' 'half4':'vector<half, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half4':'vector<half, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'B' 'float4':'vector<float, 4>'
+export float4 f4h4f4compound(half4 A, float4 B) {
+  A += B; // expected-warning{{implicit conversion loses floating-point precision: 'float4' (aka 'vector<float, 4>') to 'half4' (aka 'vector<half, 4>')}}
+  return B;
+}
+
+// CHECK-LABEL: FunctionDecl {{.*}} used f4f4h4compound 'float4 (float4, half4)'
+// CHECK: CompoundAssignOperator {{.*}} 'float4':'vector<float, 4>' lvalue '+=' ComputeLHSTy='float4':'vector<float, 4>' ComputeResultTy='float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half4':'vector<half, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'half4':'vector<half, 4>' lvalue ParmVar {{.*}} 'B' 'half4':'vector<half, 4>'
+export float4 f4f4h4compound(float4 A, half4 B) {
+  A += B; // expected-warning{{implicit conversion increases floating-point precision: 'half4' (aka 'vector<half, 4>') to 'float4' (aka 'vector<float, 4>')}}
+  return A;
+}
+
+//----------------------------------------------------------------------------//
+// Case 8: int64_t4 * uint4
+//
+// The unsigned argument is promoted to the higher ranked signed type since it
+// can express all values of the unsigned argument.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used l4l4i4 'int64_t4 (int64_t4, uint4)'
+// CHECK: BinaryOperator {{.*}} 'int64_t4':'vector<long, 4>' '*'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector<long, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t4':'vector<long, 4>' lvalue ParmVar {{.*}} 'A' 'int64_t4':'vector<long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector<long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint4':'vector<unsigned int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'uint4':'vector<unsigned int, 4>' lvalue ParmVar {{.*}} 'B' 'uint4':'vector<unsigned int, 4>'
+export int64_t4 l4l4i4(int64_t4 A, uint4 B) {
+  return A * B;
+}
+
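Scalar C++ picks the same answer for Case 8; this holds on any target where int is 32 bits and long long is 64:

  #include <type_traits>

  // unsigned int has lower rank than long long, and long long can represent
  // every unsigned int value, so the result stays signed.
  static_assert(std::is_same_v<decltype(1ll * 1u), long long>);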
+//----------------------------------------------------------------------------//
+// Case 9: Compound assignment of int4 from int64_t4
+//
+// In compound assignment the RHS is converted to match the LHS.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used i4i4l4compound 'int4 (int4, int64_t4)'
+// CHECK: CompoundAssignOperator {{.*}} 'int4':'vector<int, 4>' lvalue '+=' ComputeLHSTy='int4':'vector<int, 4>' ComputeResultTy='int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'A' 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t4':'vector<long, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t4':'vector<long, 4>' lvalue ParmVar {{.*}} 'B' 'int64_t4':'vector<long, 4>'
+export int4 i4i4l4compound(int4 A, int64_t4 B) {
+  A += B; // expected-warning{{implicit conversion loses integer precision: 'int64_t4' (aka 'vector<long, 4>') to 'int4' (aka 'vector<int, 4>')}}
+  return A;
+}
+
+//----------------------------------------------------------------------------//
+// Case 10: Compound assignment of vector<unsigned long, 4> with argument of
+// vector<long long, 4>
+//
+// In compound assignment the RHS is converted to match the LHS. This one is
+// also the weird case because it is out of spec, but we should handle it
+// gracefully.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used wierdocompound 'vector<unsigned long, 4> (vector<unsigned long, 4>, vector<long long, 4>)'
+// CHECK: CompoundAssignOperator {{.*}} 'vector<unsigned long, 4>' lvalue '+=' ComputeLHSTy='vector<unsigned long, 4>' ComputeResultTy='vector<unsigned long, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'vector<unsigned long, 4>' lvalue ParmVar {{.*}} 'A' 'vector<unsigned long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<unsigned long, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<long long, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'vector<long long, 4>' lvalue ParmVar {{.*}} 'B' 'vector<long long, 4>'
+export vector<unsigned long, 4> wierdocompound(vector<unsigned long, 4> A, vector<long long, 4> B) {
+  // expected-warning@#wierdocompound{{implicit conversion changes signedness: 'vector<long long, 4>' (vector of 4 'long long' values) to 'vector<unsigned long, 4>' (vector of 4 'unsigned long' values)}}
+  A += B; // #wierdocompound
+  return A;
+}
+
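Cases 9 and 10 mirror the scalar compound-assignment rule: the LHS cannot change type, so the RHS converts to it even when that narrows. A small illustrative sketch:

  void compound_narrowing(int a, long long b) {
    a += b; // RHS converts to int; -Wconversion warns about lost precision
  }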
+//----------------------------------------------------------------------------//
+// Case 11: Compound assignment of scalar with vector argument.
+//
+// Because the LHS of a compound assignment cannot change type, the RHS must be
+// implicitly convertible to the LHS type.
+//----------------------------------------------------------------------------//
+
+// CHECK-LABEL: FunctionDecl {{.*}} used ffi2compound 'float (float, int2)'
+// CHECK: CompoundAssignOperator {{.*}} 'float' lvalue '+=' ComputeLHSTy='float' ComputeResultTy='float'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'A' 'float'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int2':'vector<int, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'B' 'int2':'vector<int, 2>'
+export float ffi2compound(float A, int2 B) {
+  A += B; // expected-warning {{implicit conversion turns vector to scalar: 'int2' (aka 'vector<int, 2>') to 'float'}}
+  return A;
+}
+
+// CHECK-LABEL: FunctionDecl {{.*}} used iif2compound 'int (int, float2)'
+// CHECK: CompoundAssignOperator {{.*}} 'int' lvalue '+=' ComputeLHSTy='int' ComputeResultTy='int'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'A' 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float'
+// CHECK-NEXT: ImplicitCastExpr{{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector<float, 2>' lvalue ParmVar {{.*}} 'B' 'float2':'vector<float, 2>'
+export int iif2compound(int A, float2 B) {
+  A += B; // expected-warning{{implicit conversion turns vector to scalar: 'float2' (aka 'vector<float, 2>') to 'int'}}
+  return A;
+}
+
+
+//----------------------------------------------------------------------------//
+// Case 12: Compound assignment of vector of larger size than the argument.
+//
+// Because the LHS of a compound assignment cannot change type, the RHS must be
+// implicitly convertible to the LHS type. This fails since the RHS type can't
+// be vector-extended implicitly.
+//----------------------------------------------------------------------------//
+
+#ifdef ERRORS
+// The only cases that are really illegal here are when the RHS is a vector that
+// is larger than the LHS or when the LHS is a scalar.
+
+export float2 f2f4i2compound(float4 A, int2 B) {
+  A += B; // expected-error{{left hand operand of type 'float4' (aka 'vector<float, 4>') to compound assignment cannot be truncated when used with right hand operand of type 'int2' (aka 'vector<int, 2>')}}
+  return A.xy;
+}
+
+#endif
+
+//----------------------------------------------------------------------------//
+// Case 13: Comparison operators for mismatched arguments follow the same rules.
+//
+// Compare operators convert each argument following the usual arithmetic
+// conversions.
+//----------------------------------------------------------------------------//
+
+// Note: these cases work and generate correct code, but the way they get there
+// may change with https://github.com/llvm/llvm-project/issues/91639, because
+// representing boolean vectors as 32-bit integer vectors will allow more
+// efficient code generation.
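A scalar C++ analogue of Case 13 below makes the precision hazard concrete; names and constants are illustrative:

  // Relational operands undergo the usual arithmetic conversions first.
  bool cmp(float a, int b) {
    return a < b; // b converts to float before the compare
  }

  // cmp(16777216.0f, 16777217) is false: 16777217 rounds to 16777216.0f,
  // even though the comparison is true in exact arithmetic.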
+
+// CHECK-LABEL: FunctionDecl {{.*}} used b4f4i4Compare 'bool4 (float4, int4)'
+// CHECK: ImplicitCastExpr {{.*}} 'vector<bool, 4>'
+// CHECK-NEXT: BinaryOperator {{.*}} 'vector<bool, 4>' '<'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float4':'vector<float, 4>' lvalue ParmVar {{.*}} 'A' 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export bool4 b4f4i4Compare(float4 A, int4 B) {
+  return A < B; // expected-warning{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float4' (aka 'vector<float, 4>') may lose precision}}
+}
+
+
+// CHECK-LABEL: FunctionDecl {{.*}} used b2f2i4Compare 'bool2 (float2, int4)'
+// CHECK: ImplicitCastExpr {{.*}} 'vector<bool, 2>'
+// CHECK-NEXT: BinaryOperator {{.*}} 'vector<bool, 2>' '<='
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'vector<float, 2>' lvalue ParmVar {{.*}} 'A' 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'vector<float, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 2>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+
+export bool2 b2f2i4Compare(float2 A, int4 B) {
+  // expected-warning@#b2f2i4Compare{{implicit conversion truncates vector: 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>')}}
+  // expected-warning@#b2f2i4Compare{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>') may lose precision}}
+  return A <= B; // #b2f2i4Compare
+}
+
+// CHECK-LABEL: FunctionDecl {{.*}} used b4fi4Compare 'bool4 (float, int4)'
+// CHECK: ImplicitCastExpr {{.*}} 'vector<bool, 4>'
+// CHECK-NEXT: BinaryOperator {{.*}} 'vector<bool, 4>' '>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'A' 'float'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int4':'vector<int, 4>' lvalue ParmVar {{.*}} 'B' 'int4':'vector<int, 4>'
+export bool4 b4fi4Compare(float A, int4 B) {
+  return A > B; // expected-warning{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'vector<float, 4>' (vector of 4 'float' values) may lose precision}}
+}
+
+//----------------------------------------------------------------------------//
+// Case 14: Logical operators on vectors are disallowed in HLSL 2021+
+//----------------------------------------------------------------------------//
+
+#ifdef ERRORS
+
+#if __HLSL_VERSION >= 2021
+// expected-error@#b4f4i4Logical{{invalid operands to binary expression ('float4' (aka 'vector<float, 4>') and 'int4' (aka 'vector<int, 4>'))}}
+// expected-note@#b4f4i4Logical{{did you mean or?}}
+#else
+// expected-warning@#b4f4i4Logical{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float4' (aka 'vector<float, 4>') may lose precision}}
+#endif
+
+export bool4 b4f4i4Logical(float4 A, int4 B) {
+  return A || B; // #b4f4i4Logical
+}
+
+#if __HLSL_VERSION >= 2021
+// expected-error@#b2f2i4Logical{{invalid operands to binary expression ('float2' (aka 'vector<float, 2>') and 'int4' (aka 'vector<int, 4>'))}}
+// expected-note@#b2f2i4Logical{{did you mean and?}}
+#else
+// expected-warning@#b2f2i4Logical{{implicit conversion truncates vector: 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>')}}
+// expected-warning@#b2f2i4Logical{{implicit conversion from 'int4' (aka 'vector<int, 4>') to 'float2' (aka 'vector<float, 2>') may lose precision}}
+#endif
+
+export bool2 b2f2i4Logical(float2 A, int4 B) {
+  return A && B; // #b2f2i4Logical
+}
+
+#if __HLSL_VERSION >= 2021
+// expected-error@#b2b2b2Logical{{invalid operands to binary expression ('bool2' (aka 'vector<bool, 2>') and 'bool2')}}
+// expected-note@#b2b2b2Logical{{did you mean and?}}
+#endif
+
+export bool2 b2b2b2Logical(bool2 A, bool2 B) {
+  return A && B; // #b2b2b2Logical
+}
+
+#endif
diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
index 92cba1dcd4bdf..8c0f8d6f271db 100644
--- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
+++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
@@ -76,3 +76,6 @@ template struct SimpleTemplate { };
 _Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), "");
 _Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), "");
+
+_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), "");
+_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), "");
diff --git a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
index b300abe577801..26f0315fb86f1 100644
--- a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
@@ -188,8 +188,7 @@ void uses() {
   // expected-note@+1{{previous clause is here}}
 #pragma acc serial device_type(*) reduction(+:Var)
   while(1);
-  // expected-error@+2{{OpenACC clause 'collapse' may not follow a 'device_type' clause in a compute construct}}
-  // expected-note@+1{{previous clause is here}}
+  // expected-error@+1{{OpenACC 'collapse' clause is not valid on 'kernels' directive}}
 #pragma acc kernels device_type(*) collapse(1)
   while(1);
   // expected-error@+2{{OpenACC clause 'bind' may not follow a 'device_type' clause in a compute construct}}
diff --git a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
index ac61976ff620d..3212c19d089fc 100644
--- a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
+++ b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
@@ -138,7 +138,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop auto reduction(+:Var)
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop auto collapse(1)
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
@@ -277,7 +276,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop reduction(+:Var) auto
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop collapse(1) auto
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
@@ -417,7 +415,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop independent reduction(+:Var)
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop independent collapse(1)
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
@@ -556,7 +553,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop reduction(+:Var) independent
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop collapse(1) independent
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
@@ -705,7 +701,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop seq reduction(+:Var)
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop seq collapse(1)
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
@@ -853,7 +848,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented}}
 #pragma acc loop reduction(+:Var) seq
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented}}
 #pragma acc loop collapse(1) seq
   for(;;);
   // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}}
diff --git a/clang/test/SemaOpenACC/loop-construct-collapse-ast.cpp b/clang/test/SemaOpenACC/loop-construct-collapse-ast.cpp
new file mode 100644
index 0000000000000..3bdcfbf95b96c
--- /dev/null
+++ b/clang/test/SemaOpenACC/loop-construct-collapse-ast.cpp
@@ -0,0 +1,158 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+struct S {
+  constexpr S(){};
+  constexpr operator auto() {return 1;}
+};
+
+void NormalUses() {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc loop collapse(1)
+  for(;;);
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: ConstantExpr{{.*}}'int'
+  // CHECK-NEXT: value: Int 1
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+
+#pragma acc loop collapse(force:S{})
+  for(;;);
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: ConstantExpr{{.*}}'int'
+  // CHECK-NEXT: value: Int 1
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int'
+  // CHECK-NEXT: MemberExpr{{.*}} .operator auto
+  // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'S' lvalue
+  // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'S' 'void ()' list
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+}
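For context on what the dumped clause means: collapse(N) asks the implementation to treat the next N perfectly nested loops as a single iteration space when dividing up work. A hedged usage sketch (the kernel is illustrative, not part of this test):

  void collapse_demo(float *y, const float *x) {
  #pragma acc parallel loop collapse(2)
    for (int i = 0; i < 4; ++i)
      for (int j = 0; j < 4; ++j)
        y[i * 4 + j] += x[i * 4 + j]; // one 16-iteration space for scheduling
  }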
+
+template <typename T, unsigned int Value>
+void TemplUses() {
+  // CHECK: FunctionTemplateDecl{{.*}}TemplUses
+  // CHECK-NEXT: TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 T
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'unsigned int' depth 0 index 1 Value
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void ()'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc loop collapse(Value)
+  for(;;)
+    for(;;);
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'Value'
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+
+#pragma acc loop collapse(force:T{} + S{})
+  for(;;)
+    for(;;);
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: BinaryOperator {{.*}}'+'
+  // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}}'T' 'T' list
+  // CHECK-NEXT: InitListExpr
+  // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'S' 'void ()' list
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+
+  // Instantiation:
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void ()' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'S'
+  // CHECK-NEXT: RecordType{{.*}} 'S'
+  // CHECK-NEXT: CXXRecord{{.*}} 'S'
+  // CHECK-NEXT: TemplateArgument integral '2U'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: ConstantExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: value: Int 2
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl
+  // CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned int' 2
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+
+  // CHECK-NEXT: OpenACCLoopConstruct
+  // CHECK-NEXT: collapse clause
+  // CHECK-NEXT: ConstantExpr{{.*}}'int'
+  // CHECK-NEXT: value: Int 2
+  // CHECK-NEXT: BinaryOperator {{.*}}'+'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int'
+  // CHECK-NEXT: MemberExpr{{.*}} .operator auto
+  // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'S' lvalue
+  // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'S' 'void ()' list
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int'
+  // CHECK-NEXT: MemberExpr{{.*}} .operator auto
+  // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'S' lvalue
+  // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'S' 'void ()' list
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ForStmt
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: NullStmt
+
+}
+
+void Inst() {
+  TemplUses<S, 2>();
+}
+
+#endif // PCH_HELPER
diff --git a/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp b/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp
new file mode 100644
index 0000000000000..9c1e577773e8f
--- /dev/null
+++ b/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+
+void only_for_loops() {
+  // expected-error@+3{{OpenACC 'loop' construct can only be applied to a 'for' loop}}
+  // expected-note@+1{{'loop' construct is here}}
+#pragma acc loop collapse(1)
+  while(true);
+
+  // expected-error@+3{{OpenACC 'loop' construct can only be applied to a 'for' loop}}
+  // expected-note@+1{{'loop' construct is here}}
+#pragma acc loop collapse(1)
+  do{}while(true);
+
+}
+
+void only_one_on_loop() {
+  // expected-error@+2{{OpenACC 'collapse' clause cannot appear more than once on a 'loop' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc loop collapse(1) collapse(1)
+  for(;;);
+}
+
+constexpr int three() { return 3; }
+constexpr int one() { return 1; }
+constexpr int neg() { return -1; }
+constexpr int zero() { return 0; }
+
+struct NotConstexpr {
+  constexpr NotConstexpr(){};
+
+  operator int(){ return 1; }
+};
+struct ConvertsNegative {
+  constexpr ConvertsNegative(){};
+
+  constexpr operator int(){ return -1; }
+};
+struct ConvertsOne{
+  constexpr ConvertsOne(){};
+
+  constexpr operator int(){ return 1; }
+};
+
+struct ConvertsThree{
+  constexpr ConvertsThree(){};
+
+  constexpr operator int(){ return 3; }
+};
+
+template <typename T, int Val>
+void negative_constexpr_templ() {
+  // expected-error@+3 2{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to 0}}
+  // expected-note@#NCETN1{{in instantiation of function template specialization 'negative_constexpr_templ'}}
+  // expected-note@#NCET1{{in instantiation of function template specialization 'negative_constexpr_templ'}}
+#pragma acc loop collapse(T{})
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to -1}}
+#pragma acc loop collapse(Val)
+  for(;;)
+    for(;;);
+}
+
+void negative_constexpr(int i) {
+#pragma acc loop collapse(2)
+  for(;;)
+    for(;;);
+
+#pragma acc loop collapse(1)
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to 0}}
+#pragma acc loop collapse(0)
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to -1}}
+#pragma acc loop collapse(-1)
+  for(;;)
+    for(;;);
+
+#pragma acc loop collapse(one())
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to 0}}
+#pragma acc loop collapse(zero())
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to -1}}
+#pragma acc loop collapse(neg())
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a constant expression}}
+#pragma acc loop collapse(NotConstexpr{})
+  for(;;)
+    for(;;);
+
+  // expected-error@+1{{OpenACC 'collapse' clause loop count must be a positive integer value, evaluated to -1}}
+#pragma acc loop collapse(ConvertsNegative{})
+  for(;;)
+    for(;;);
+
+#pragma acc loop collapse(ConvertsOne{})
+  for(;;)
+    for(;;);
+
+  negative_constexpr_templ(); // #NCETN1
+
+  negative_constexpr_templ(); // #NCET1
+}
+
diff --git a/clang/test/SemaOpenACC/loop-construct-device_type-clause.c b/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
index 520ba45aaebf4..47c9239f4f0e9 100644
--- a/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
+++ b/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
@@ -162,7 +162,6 @@ void uses() {
   // expected-note@+1{{previous clause is here}}
 #pragma acc loop device_type(*) reduction(+:Var)
   for(;;);
-  // expected-warning@+1{{OpenACC clause 'collapse' not yet implemented, clause ignored}}
 #pragma acc loop device_type(*) collapse(1)
   for(;;);
   // expected-error@+2{{OpenACC clause 'bind' may not follow a 'device_type' clause in a 'loop' construct}}
diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
index 5450d105a6f54..8ca399a0f729a 100644
--- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
+++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
@@ -666,3 +666,37 @@ int foo() {
 }
 
 } // namespace eve
+
+namespace GH93099 {
+
+// Issues with sizeof...(expr)
+
+template <class T> struct C {
+  template <auto... N>
+    requires(sizeof...(N) > 0)
+  friend class NTTP;
+
+  template <class... Tp>
+    requires(sizeof...(Tp) > 0)
+  friend class TP;
+
+  template